├── README.md
├── src_culda
├── .gitignore
├── Makefile
├── kernel
│ ├── lda_phi_kernel.cu
│ ├── lda_phi_kernel.h
│ ├── lda_theta_kernel.back.cu
│ ├── lda_theta_kernel.back.h
│ ├── lda_theta_kernel.cu
│ ├── lda_theta_kernel.h
│ ├── lda_train_kernel.cu
│ └── lda_train_kernel.h
├── main.cpp
├── model
│ ├── culda_argument.h
│ ├── doc.cpp
│ ├── doc.h
│ ├── doc_chunk.cpp
│ ├── doc_chunk.h
│ ├── model_phi.cpp
│ ├── model_phi.h
│ ├── model_phi_gpu.cpp
│ ├── model_phi_gpu.h
│ ├── model_theta.cpp
│ ├── model_theta.h
│ ├── model_theta_chunk.cpp
│ ├── model_theta_chunk.h
│ ├── vocab.cpp
│ └── vocab.h
└── train
│ ├── MultiChunkMultiGPUequal.h
│ ├── SingleChunkSingleGPU.h
│ ├── lda_train.cu
│ └── lda_train.h
└── src_format
├── .gitignore
├── Makefile
├── data_chunk.h
└── format.cpp
/README.md:
--------------------------------------------------------------------------------
1 | # CuLDA_CGS
2 |
3 | CuLDA_CGS is a GPU solution for CGS-based LDA sampling. It is efficient, achieving up to 686M tokens/sec. To the best of our knowledge, it is the first LDA solution that supports GPUs.
4 |
5 |
6 | ## Input Data Preparation
7 | ./src_format contains a program to transform a text corpus to the input format of CuLDA_CGS. The transformed data format is more efficient for subsequent processing and is partitioned into multiple chunks to support multi-GPU scaling.
8 |
9 | Run Command "make" in the directory and use the following command to transform the data:
10 |
11 | ./format input output_prefix numChunks[default=1]
12 |
13 | The input format of ./format is like:
14 |
15 | doc-name1 token1 token2 token3\n
16 | doc-name2 token4 token5 token6\n
17 | ...
18 |
19 | Tokens are separated by space, documents are separated by line.
20 |
21 | ## Compile and Run CuLDA_CGS
22 | Everything about CuLDA_CGS is in ./src_culda. It does not rely on any 3rd-party dependency. All you need is a CUDA environment and a CUDA-enabled GPU.
23 |
24 | Before you run command "make" in the directory, remember to change CXX_FLAG to your targeted architecture and change CUDA_INSTALL_PATH to your CUDA directory.
25 |
26 | Then you can run ./culda for LDA sampling, the usage is:
27 |
28 | ./culda [options]
29 |
30 | Possible options
31 |
32 | -g
33 | -k : currently only support 1024
34 | -t
35 | -s : it has been deprecated
36 | -a : 50/1024 for our tested data sets
37 | -b : 0.01 for our tested data sets
38 | -c : must be equal to -g, and must be consistent with the chunk number specified in the data preparation stage
39 | -i : Same with the output_prefix in the data preparation stage.
40 | -o : It's not used now. Rewrite ModelPhi::savePhi and ModelTheta::saveTheta as you need it.
41 |
42 | CuLDA_CGS outputs the number of processed tokens per second and the log-likelihood after each iteration.
43 |
44 |
--------------------------------------------------------------------------------
/src_culda/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | culda
3 | obj/*
4 | Theta*
5 | theta*
6 | Phi*
7 | phi*
8 | Topics*
9 | topics*
10 | *ini
11 |
--------------------------------------------------------------------------------
/src_culda/Makefile:
--------------------------------------------------------------------------------
1 | #go!
2 | # Build configuration for CuLDA_CGS: nvcc compiles both the .cpp and .cu sources.
3 | # Per the README, change CXX_FLAG to your target GPU architecture and
4 | # CUDA_INSTALL_PATH to your CUDA installation before running "make".
5 | CXX = nvcc
6 | CUDA_INSTALL_PATH = /usr/local/cuda
7 | OBJECTIVE = ./obj
8 | CXX_FLAG = -arch=sm_61
9 | #-maxrregcount=32
10 | # Object files are emitted into $(OBJECTIVE); the directory must exist.
11 | #-maxrregcount=32
12 | # DEBUG_FLAG is empty by default; uncomment a line below to enable device debug (-G -g) or debug prints.
13 | DEBUG_FLAG =
14 | #DEBUG_FLAG = -G -g
15 | #DEBUG_FLAG = -DDEBUGPrint
16 |
17 | culda:$(OBJECTIVE)/main.o \
18 | $(OBJECTIVE)/lda_train_kernel.o \
19 | $(OBJECTIVE)/lda_train.o \
20 | $(OBJECTIVE)/lda_theta_kernel.o \
21 | $(OBJECTIVE)/lda_phi_kernel.o \
22 | $(OBJECTIVE)/model_theta.o \
23 | $(OBJECTIVE)/model_theta_chunk.o \
24 | $(OBJECTIVE)/model_phi_gpu.o \
25 | $(OBJECTIVE)/model_phi.o \
26 | $(OBJECTIVE)/vocab.o \
27 | $(OBJECTIVE)/doc.o \
28 | $(OBJECTIVE)/doc_chunk.o
29 | $(CXX) $(OBJECTIVE)/main.o \
30 | $(OBJECTIVE)/model_theta.o \
31 | $(OBJECTIVE)/model_theta_chunk.o \
32 | $(OBJECTIVE)/model_phi.o \
33 | $(OBJECTIVE)/model_phi_gpu.o \
34 | $(OBJECTIVE)/vocab.o \
35 | $(OBJECTIVE)/doc.o \
36 | $(OBJECTIVE)/doc_chunk.o \
37 | $(OBJECTIVE)/lda_train_kernel.o \
38 | $(OBJECTIVE)/lda_train.o \
39 | $(OBJECTIVE)/lda_theta_kernel.o \
40 | $(OBJECTIVE)/lda_phi_kernel.o \
41 | -o culda \
42 | -L $(CUDA_INSTALL_PATH)/lib64 \
43 | $(CXX_FLAG) $(DEBUG_FLAG)
44 |
45 |
46 | $(OBJECTIVE)/main.o:\
47 | main.cpp \
48 | ./model/model_theta.h \
49 | ./model/vocab.h \
50 | ./model/doc.h \
51 | ./model/doc_chunk.h \
52 | ./model/culda_argument.h \
53 | ./train/lda_train.h
54 | $(CXX) -c main.cpp \
55 | -o $(OBJECTIVE)/main.o \
56 | -I $(CUDA_INSTALL_PATH)/include \
57 | -L $(CUDA_INSTALL_PATH)/lib64 \
58 | $(CXX_FLAG) $(DEBUG_FLAG)
59 |
60 | $(OBJECTIVE)/doc.o:\
61 | ./model/doc.cpp \
62 | ./model/doc.h \
63 | ./model/doc_chunk.h
64 | $(CXX) -c ./model/doc.cpp \
65 | -o $(OBJECTIVE)/doc.o \
66 | $(CXX_FLAG) $(DEBUG_FLAG)
67 |
68 | $(OBJECTIVE)/doc_chunk.o:\
69 | ./model/doc_chunk.cpp \
70 | ./model/doc_chunk.h \
71 | ./model/culda_argument.h
72 | $(CXX) -c ./model/doc_chunk.cpp \
73 | -o $(OBJECTIVE)/doc_chunk.o \
74 | $(CXX_FLAG) $(DEBUG_FLAG)
75 |
76 | $(OBJECTIVE)/vocab.o:\
77 | ./model/vocab.cpp \
78 | ./model/vocab.h
79 | $(CXX) -c ./model/vocab.cpp \
80 | -o $(OBJECTIVE)/vocab.o \
81 | $(CXX_FLAG) $(DEBUG_FLAG)
82 |
83 | $(OBJECTIVE)/lda_theta_kernel.o:\
84 | ./kernel/lda_theta_kernel.cu \
85 | ./kernel/lda_theta_kernel.h \
86 | ./kernel/lda_train_kernel.h \
87 | ./model/culda_argument.h
88 | $(CXX) -c ./kernel/lda_theta_kernel.cu \
89 | -o $(OBJECTIVE)/lda_theta_kernel.o \
90 | $(CXX_FLAG) -Xptxas -v $(DEBUG_FLAG)
91 |
92 | $(OBJECTIVE)/lda_train_kernel.o:\
93 | ./kernel/lda_train_kernel.cu \
94 | ./kernel/lda_train_kernel.h \
95 | ./model/culda_argument.h
96 | $(CXX) -c ./kernel/lda_train_kernel.cu \
97 | -o $(OBJECTIVE)/lda_train_kernel.o \
98 | $(CXX_FLAG) -Xptxas -v $(DEBUG_FLAG)
99 |
100 | $(OBJECTIVE)/lda_train.o:\
101 | ./train/lda_train.cu \
102 | ./train/lda_train.h \
103 | ./train/MultiChunkMultiGPUequal.h \
104 | ./train/SingleChunkSingleGPU.h \
105 | ./kernel/lda_train_kernel.h \
106 | ./model/doc.h \
107 | ./model/doc_chunk.h \
108 | ./model/vocab.h \
109 | ./model/model_theta.h \
110 | ./model/model_theta_chunk.h \
111 | ./model/model_phi.h \
112 | ./model/model_phi_gpu.h\
113 | ./model/culda_argument.h
114 | $(CXX) -c ./train/lda_train.cu \
115 | -o $(OBJECTIVE)/lda_train.o \
116 | $(CXX_FLAG) $(DEBUG_FLAG)
117 |
118 | $(OBJECTIVE)/lda_phi_kernel.o:\
119 | ./kernel/lda_phi_kernel.cu \
120 | ./kernel/lda_phi_kernel.h \
121 | ./kernel/lda_train_kernel.h \
122 | ./model/culda_argument.h
123 | $(CXX) -c ./kernel/lda_phi_kernel.cu \
124 | -o $(OBJECTIVE)/lda_phi_kernel.o \
125 | $(CXX_FLAG) $(DEBUG_FLAG)
126 |
127 | $(OBJECTIVE)/model_phi_gpu.o:\
128 | ./model/model_phi_gpu.cpp \
129 | ./model/model_phi_gpu.h \
130 | ./model/doc.h \
131 | ./model/doc_chunk.h \
132 | ./model/vocab.h \
133 | ./model/culda_argument.h \
134 | ./kernel/lda_train_kernel.h
135 | $(CXX) -c ./model/model_phi_gpu.cpp \
136 | -o $(OBJECTIVE)/model_phi_gpu.o \
137 | $(CXX_FLAG) $(DEBUG_FLAG)
138 |
139 | $(OBJECTIVE)/model_phi.o:\
140 | ./model/model_phi.cpp \
141 | ./model/model_phi.h \
142 | ./model/doc.h \
143 | ./model/doc_chunk.h \
144 | ./model/vocab.h \
145 | ./model/culda_argument.h \
146 | ./model/model_phi_gpu.h \
147 | ./kernel/lda_train_kernel.h
148 | $(CXX) -c ./model/model_phi.cpp \
149 | -o $(OBJECTIVE)/model_phi.o \
150 | $(CXX_FLAG) $(DEBUG_FLAG)
151 |
152 | $(OBJECTIVE)/model_theta_chunk.o:\
153 | ./model/model_theta_chunk.cpp \
154 | ./model/model_theta_chunk.h \
155 | ./model/doc.h \
156 | ./model/doc_chunk.h \
157 | ./model/vocab.h \
158 | ./model/culda_argument.h \
159 | ./kernel/lda_train_kernel.h
160 | $(CXX) -c ./model/model_theta_chunk.cpp \
161 | -o $(OBJECTIVE)/model_theta_chunk.o \
162 | $(CXX_FLAG) $(DEBUG_FLAG)
163 |
164 | $(OBJECTIVE)/model_theta.o:\
165 | ./model/model_theta.cpp \
166 | ./model/model_theta.h \
167 | ./model/doc.h \
168 | ./model/doc_chunk.h \
169 | ./model/vocab.h \
170 | ./model/culda_argument.h \
171 | ./model/model_theta_chunk.h \
172 | ./model/culda_argument.h \
173 | ./kernel/lda_train_kernel.h
174 | $(CXX) -c ./model/model_theta.cpp \
175 | -o $(OBJECTIVE)/model_theta.o \
176 | $(CXX_FLAG) $(DEBUG_FLAG)
177 |
178 |
179 | clean:
180 | rm -f culda
181 | rm -f *.o
182 | rm -f obj/*
183 |
--------------------------------------------------------------------------------
/src_culda/kernel/lda_phi_kernel.cu:
--------------------------------------------------------------------------------
1 |
2 | #include
3 |
4 | #include "lda_train_kernel.h"
5 | #include "lda_phi_kernel.h"
6 |
7 | /* phihead comput kernels */
8 | __global__ void LDAcomputePhiHeadKernel( // recompute phiHead[w*k+t] = (n_wt + beta)/(n_t + beta*numWords), stored as half
9 |     int k, // number of topics
10 |     float beta, // phi smoothing hyperparameter (README option -b)
11 |     int numWords, // vocabulary size
12 |     int numWordsPerWorker, // unused in the body; the API launcher always passes 1
13 |     PHITYPE *phiTopicWordShort, // per-word topic counts, laid out numWords*k (indexed wordId*k + tmpk)
14 |     int *phiTopic, // per-topic totals, length k
15 |     half *phiHead) // output probabilities, numWords*k, half precision
16 | {
17 |     int tid = threadIdx.x + blockDim.x*blockIdx.x;
18 |     int workerId = tid/32; // one 32-thread warp ("worker") per word
19 |     int laneId = tid%32;
20 |     int wordId = workerId;
21 |
22 |     if(workerId >= numWords)return; // drop surplus warps
23 |
24 |     for(int tmpk = laneId; tmpk < k; tmpk += 32){ // lanes stride over the k topics
25 |
26 |         float tmpHead = (phiTopicWordShort[wordId*k + tmpk] + beta)/(phiTopic[tmpk] + beta*numWords);
27 |
28 |         //if(tmpk = 1024)tmpHead *=1.01;
29 |         phiHead[wordId*k + tmpk] = __float2half(tmpHead); // convert to half for storage
30 |     }
31 | }
32 |
33 | __global__ void LDAcheckPhiHeadKernel( // debug helper: print any negative phiHead entry
34 |     int k,
35 |     int numWords, // NOTE(review): unused in the body
36 |     half *phiHead)
37 | {
38 |
39 |     int tid = threadIdx.x + blockDim.x*blockIdx.x;
40 |     int wordId = tid/1024; // divisor 1024 is hard-coded; assumes k == 1024 — TODO confirm
41 |     int tmpk = tid%1024;
42 |
43 |     float tmp = __half2float(phiHead[wordId*k + tmpk]);
44 |     if(tmp < 0){ // a probability estimate should never be negative
45 |         printf("phihead check error:wordid(%d), k(%d), head(%.6f), index(%d)\n", wordId, tmpk, tmp, wordId*k + tmpk);
46 |     }
47 | }
48 |
49 | void LDAComputePhiHeadAPI( // host-side launcher for LDAcomputePhiHeadKernel
50 |     int k,
51 |     float beta,
52 |     int numWords,
53 |     PHITYPE *phiTopicWordShort,
54 |     int *phiTopic,
55 |     half *phiHead,
56 |     cudaStream_t stream) // header declares a default of stream 0
57 | {
58 |
59 |     //printf("call LDAComputePhiHeadAPI ...\n");
60 |     LDAcomputePhiHeadKernel<<<(numWords+3)/4,128,0, stream>>>( // 128 threads = 4 warps/block, one warp per word
61 |         k,
62 |         beta,
63 |         numWords,
64 |         1, // numWordsPerWorker (unused by the kernel)
65 |         phiTopicWordShort,
66 |         phiTopic,
67 |         phiHead
68 |     );
69 |
70 |     //printf("LDAcheckPhiHeadKernel ...\n");
71 |     //LDAcheckPhiHeadKernel<<>>(k, numWords, phiHead); // NOTE(review): launch config appears stripped in this dump
72 | }
73 |
74 | /* phi update kernels */
75 |
76 | __global__ void LDAUpdatePhiKernel( // rebuild topic counts from the current token topic assignments
77 |     int k,
78 |     int numWords,
79 |     long long *wordIndices, // offsets delimiting each word's token range: [wordIndices[w], wordIndices[w+1])
80 |     short *wordTopics, // current topic assignment per token
81 |     PHITYPE *phiTopicWordShort, // output: per-word topic counts, numWords*k
82 |     int *phiTopicWordSub, // int scratch: one k-wide row per worker (indexed workerId*k)
83 |     int *phiTopic, // per-topic totals; only incremented here — presumably zeroed by the caller, confirm
84 |     int numWorkers)
85 | {
86 |
87 |     int tid = threadIdx.x + blockDim.x*blockIdx.x;
88 |
89 |     int workerId = tid/32; // one warp per worker
90 |     int laneId = tid%32;
91 |
92 |     if(workerId >= numWorkers) return;
93 |
94 |     for(int wordId = workerId; wordId < numWords; wordId += numWorkers){ // workers stride over words
95 |
96 |         long long tokenStart = __ldg(&wordIndices[wordId]);
97 |         long long tokenEnd = __ldg(&wordIndices[wordId + 1]);
98 |
99 |         //clean
100 |         for(int offset = laneId; offset < k;offset += 32)
101 |             phiTopicWordSub[workerId*k + offset] = 0; // zero this worker's scratch row
102 |
103 |         //add
104 |         for(long long tokenIdx = tokenStart + laneId;
105 |             tokenIdx < tokenEnd;
106 |             tokenIdx += 32) //iterate over tokens
107 |         {
108 |             int tmpK = __ldg(&wordTopics[tokenIdx]);
109 |             atomicAdd(&(phiTopicWordSub[workerId*k + tmpK]),1); // lanes may collide on a topic, hence atomic
110 |             atomicAdd(&(phiTopic[tmpK]),1); // global per-topic total
111 |         }
112 |
113 |         //transform
114 |         for(int offset = laneId; offset < k;offset += 32)
115 |             phiTopicWordShort[wordId*k + offset] = phiTopicWordSub[workerId*k + offset]; // int scratch -> PHITYPE output
116 |     }
117 | }
118 |
119 | /*
120 | __global__ void LDAPhiCheckKernel(
121 | int k,
122 | int numWords,
123 | PHITYPE *phiTopicWordShort)
124 | {
125 | int tid = threadIdx.x + blockDim.x*blockIdx.x;
126 | if(tid > k*numWords)return;
127 |
128 | if(phiTopicWordShort[tid] < 0)
129 | {
130 | printf("phi check error: word(%d), k(%d), int(%d), short(%d)\n",
131 | tid/k, tid%k, phiTopicWordShort[tid], phiTopicWordShort[tid]);
132 | }
133 | }
134 | */
135 |
136 | void LDAUpdatePhiAPI( // host-side launcher for LDAUpdatePhiKernel
137 |     int k,
138 |     int numWords,
139 |     long long *wordIndices,
140 |     short *wordTopics,
141 |     PHITYPE *phiTopicWordShort,
142 |     int *phiTopicWordSub,
143 |     int *phiTopic,
144 |     cudaStream_t stream) // header declares a default of stream 0
145 | {
146 |
147 |     LDAUpdatePhiKernel<<<(UpdateNumWorkers+3)/4, 128, 0,stream>>>( // 4 warps/block; UpdateNumWorkers warps total
148 |         k,
149 |         numWords,
150 |         wordIndices,
151 |         wordTopics,
152 |         phiTopicWordShort,
153 |         phiTopicWordSub,
154 |         phiTopic,
155 |         UpdateNumWorkers // constant from culda_argument.h — see that header for its value
156 |     );
157 |
158 |     //LDAPhiCheckKernel<<<(k*numWords + 127)/128, 128,0, stream>>>(k,numWords,phiTopicWordShort);
159 | }
160 |
161 | /* MultiGPU Reduce Kernels */
162 | __global__ void LDAUpdatePhiReduceKernelShort( // multi-GPU reduce: accumulate a peer GPU's word-topic counts element-wise
163 |     int k,
164 |     int numWords,
165 |     PHITYPE *phiTopicWordShort, // destination, numWords*k
166 |     PHITYPE *phiTopicWordShortCopy) // peer copy to fold in
167 | {
168 |     int tid = threadIdx.x + blockIdx.x*blockDim.x;
169 |     if(tid < k*numWords) phiTopicWordShort[tid] += phiTopicWordShortCopy[tid]; // one thread per element
170 | }
171 |
172 | __global__ void LDAUpdatePhiReduceKernelInt( // multi-GPU reduce: accumulate a peer GPU's per-topic totals
173 |     int k,
174 |     int numWords, // NOTE(review): unused in the body
175 |     int *phiTopic, // destination, length k
176 |     int *phiTopicCopy) // peer copy to fold in
177 | {
178 |     int tid = threadIdx.x + blockIdx.x*blockDim.x;
179 |     //if(tid == 0)
180 |     //    printf("phiTopic[0]:%d, phiTopicShort[0]:%d\n", phiTopic[0], phiTopicCopy[0]);
181 |
182 |     if(tid < k) phiTopic[tid] += phiTopicCopy[tid]; // one thread per topic
183 | }
184 |
185 | void LDAUpdatePhiReduceAPI( // launch both reduce kernels (word-topic matrix, then topic totals) on one stream
186 |     int k,
187 |     int numWords,
188 |     PHITYPE *phiTopicWordShort,
189 |     PHITYPE *phiTopicWordShortCopy,
190 |     int *phiTopic,
191 |     int *phiTopicCopy,
192 |     cudaStream_t stream) // header declares a default of stream 0
193 | {
194 |
195 |     LDAUpdatePhiReduceKernelShort<<<(k*numWords + 127)/128,128,0,stream>>>( // one thread per matrix element
196 |         k,
197 |         numWords,
198 |         phiTopicWordShort,
199 |         phiTopicWordShortCopy
200 |     );
201 |
202 |     LDAUpdatePhiReduceKernelInt<<<(k + 127)/128,128,0,stream>>>( // one thread per topic
203 |         k,
204 |         numWords,
205 |         phiTopic,
206 |         phiTopicCopy
207 |     );
208 | }
--------------------------------------------------------------------------------
/src_culda/kernel/lda_phi_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _LDA_PHI_KERNEL_H_ // declarations for the phi (word-topic) kernels defined in lda_phi_kernel.cu
2 | #define _LDA_PHI_KERNEL_H_
3 |
4 | #include
5 | #include "../model/culda_argument.h" // PHITYPE, UpdateNumWorkers
6 |
7 |
8 | /* phihead comput kernels */
9 | __global__ void LDAcomputePhiHeadKernel( // see .cu: recompute half-precision phiHead from counts
10 |     int k,
11 |     float beta,
12 |     int numWords,
13 |     int numWordsPerWorker,
14 |     PHITYPE *phiTopicWordShort,
15 |     int *phiTopic,
16 |     half *phiHead
17 |     );
18 |
19 | __global__ void LDAcheckPhiHeadKernel( // debug: print negative phiHead entries
20 |     int k,
21 |     int numWords,
22 |     half *phiHead
23 |     );
24 |
25 | void LDAComputePhiHeadAPI( // host launcher for LDAcomputePhiHeadKernel
26 |     int k,
27 |     float beta,
28 |     int numWords,
29 |     PHITYPE *phiTopicWordShort,
30 |     int *phiTopic,
31 |     half *phiHead,
32 |     cudaStream_t stream=0 // default: legacy stream
33 |     );
34 |
35 | /* phi update kernels */
36 | __global__ void LDAUpdatePhiKernel( // rebuild topic counts from token assignments
37 |     int k,
38 |     int numWords,
39 |     long long *wordIndices,
40 |     short *wordTopics,
41 |     PHITYPE *phiTopicWordShort,
42 |     int *phiTopicWordSub,
43 |     int *phiTopic,
44 |     int numWorkers
45 |     );
46 |
47 | void LDAUpdatePhiAPI( // host launcher for LDAUpdatePhiKernel
48 |     int k,
49 |     int numWords,
50 |     long long *wordIndices,
51 |     short *wordTopics,
52 |     PHITYPE *phiTopicWordShort,
53 |     int *phiTopicWordSub,
54 |     int *phiTopic,
55 |     cudaStream_t stream=0
56 |     );
57 |
58 | /*
59 | __global__ void LDAPhiCheckKernel(
60 |     int k,
61 |     int numWords,
62 |     PHITYPE *phiTopicWordShort);
63 | */
64 |
65 | /* MultiGPU Reduce Kernels */
66 | __global__ void LDAUpdatePhiReduceKernelShort( // element-wise accumulate peer word-topic counts
67 |     int k,
68 |     int numWords,
69 |     PHITYPE *phiTopicWordShort,
70 |     PHITYPE *phiTopicWordShortCopy
71 |     );
72 |
73 | __global__ void LDAUpdatePhiReduceKernelInt( // element-wise accumulate peer topic totals
74 |     int k,
75 |     int numWords,
76 |     int *phiTopic,
77 |     int *phiTopicCopy
78 |     );
79 |
80 | void LDAUpdatePhiReduceAPI( // host launcher for both reduce kernels
81 |     int k,
82 |     int numWords,
83 |     PHITYPE *phiTopicWordShort,
84 |     PHITYPE *phiTopicWordShortCopy,
85 |     int *phiTopic,
86 |     int *phiTopicCopy,
87 |     cudaStream_t stream=0
88 |     );
89 |
90 | #endif
--------------------------------------------------------------------------------
/src_culda/kernel/lda_theta_kernel.back.cu:
--------------------------------------------------------------------------------
1 | #include "lda_train_kernel.h"
2 | #include "lda_theta_kernel.h"
3 |
4 |
5 | __global__ void LDAUpdateThetaIncreaseKernel( // backup version: build the dense per-doc topic histogram
6 |     int k,
7 |     int numDocs, // NOTE(review): unused in the body
8 |     int docIdStart, // first document of this chunk
9 |     int chunkNumDocs, // number of documents in this chunk
10 |     long long *wordIndices, // NOTE(review): unused in the body
11 |     int *wordTokens, // NOTE(review): unused in the body
12 |     short *wordTopics, // topic assignment per token
13 |     long long *docRevIndices, // per-doc offsets into docRevIdx: [docRevIndices[d], docRevIndices[d+1])
14 |     TokenIdxType *docRevIdx, // reverse index: maps a doc-local entry back to a token index
15 |     short *thetaA, // NOTE(review): unused here; written by the Align kernel
16 |     int *thetaCurIA,
17 |     int *thetaMaxIA,
18 |     short *thetaJA,
19 |     int *denseTheta) // output: chunkNumDocs*k dense counts, assumed pre-zeroed by the caller
20 | {
21 |     int tid = threadIdx.x + blockIdx.x*blockDim.x;
22 |     int BlockSize = blockDim.x;
23 |
24 |     int workerId = tid/BlockSize; // equals blockIdx.x: one thread block per document
25 |     int laneId = tid%BlockSize; // equals threadIdx.x
26 |
27 |     if(workerId >= chunkNumDocs)return;
28 |
29 |
30 |     int docId = workerId + docIdStart;
31 |
32 |
33 |     for(long long idx = docRevIndices[docId] + laneId; // threads stride over this doc's tokens
34 |         idx < docRevIndices[docId + 1];
35 |         idx += BlockSize){
36 |         int topic = wordTopics[docRevIdx[idx]];
37 |         atomicAdd(&(denseTheta[(docId - docIdStart)*k + topic]), 1); // histogram bin for (doc, topic)
38 |     }
39 |
40 | }
41 |
42 | __global__ void LDAUpdateThetaAlignKernel( // backup version: compact denseTheta into the sparse theta arrays (A = values, JA = topic ids, IA = offsets)
43 |     int k,
44 |     int numDocs, // NOTE(review): unused in the body
45 |     int docIdStart,
46 |     int chunkNumDocs,
47 |     long long *wordIndices, // NOTE(review): unused in the body
48 |     int *wordTokens, // NOTE(review): unused in the body
49 |     short *wordTopics, // NOTE(review): unused in the body
50 |     short *thetaA, // output: nonzero counts
51 |     int *thetaCurIA, // output: end offset of each doc's used region
52 |     int *thetaMaxIA, // input: start offset (capacity boundary) of each doc's region
53 |     short *thetaJA, // output: topic index of each nonzero
54 |     int *denseTheta) // input: chunkNumDocs*k dense counts built by the Increase kernel
55 | {
56 |
57 |     int laneId = threadIdx.x%32;
58 |     int localId = threadIdx.x/32; // warp index within the block (block = 64 threads -> 2 warps)
59 |     int tid = threadIdx.x + blockIdx.x*blockDim.x;
60 |     int workerId = tid/32; // one warp per document
61 |
62 |     if(workerId >= chunkNumDocs)return;
63 |
64 |     //if( workerId <= 1000 || workerId >= 5000)return;
65 |
66 |     volatile __shared__ int shaPrefixSum[64]; // one 32-slot prefix-sum scratch row per warp
67 |
68 |     int docId = docIdStart + workerId;
69 |
70 |     //if(laneId == 0)printf("docId:%d, IAStart:%d, IACurEnd:%d\n", docId, thetaMaxIA[docId], thetaMaxIA[docId + 1]);
71 |     int IAStart = thetaMaxIA[docId];
72 |     //compute
73 |     int tmpPrefixSum = 0; // running count of nonzeros emitted so far for this doc
74 |     for(int i = laneId;i < k;i += 32){ // process topics 32 at a time
75 |
76 |         //read
77 |         int tmpVal = denseTheta[(docId - docIdStart)*k + i];
78 |         int tmpBin = tmpVal > 0; // 1 if this topic has a nonzero count
79 |         shaPrefixSum[localId*32 + laneId] = tmpBin;
80 |
81 |         //prefix sum (Hillis-Steele over the warp's 32 flags; volatile shared memory, lock-step warp assumed)
82 |         if(laneId >= 1)
83 |             shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 1] + shaPrefixSum[localId*32 + laneId];
84 |         if(laneId >= 2)
85 |             shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 2] + shaPrefixSum[localId*32 + laneId];
86 |         if(laneId >= 4)
87 |             shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 4] + shaPrefixSum[localId*32 + laneId];
88 |         if(laneId >= 8)
89 |             shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 8] + shaPrefixSum[localId*32 + laneId];
90 |         if(laneId >= 16)
91 |             shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 16] + shaPrefixSum[localId*32 + laneId];
92 |
93 |
94 |         //
95 |         //debug
96 |         //if(laneId == 0)
97 |         //    printf("old tmpPrefixSum:%d\n", tmpPrefixSum);
98 |         //printf("laneId:%2d, denseTheta:%d, tmpVal:%d, tmpBin:%d, prefix:%d\n",
99 |         //    laneId, denseTheta[docId*k + i], tmpVal, tmpBin, shaPrefixSum[laneId]);
100 |
101 |         //write
102 |
103 |
104 |         int offset = tmpPrefixSum + shaPrefixSum[localId*32 + laneId] - 1; // position of this lane's nonzero in the doc's region
105 |
106 |
107 |         if(tmpVal > 0){
108 |
109 |             //printf("blockid:%5d, threadIdx.x:%4d, IAStart + offset:%lld\n", blockIdx.x, threadIdx.x, IAStart + offset);
110 |             thetaA[IAStart + offset] = tmpVal;
111 |             thetaJA[IAStart + offset] = i;
112 |         }
113 |
114 |         tmpPrefixSum += shaPrefixSum[localId*32 + 31]; // lane 31 holds this round's total nonzero count
115 |
116 |
117 |         //debug
118 |         //if(laneId == 0)
119 |         //    printf("new tmpPrefixSum:%d\n", tmpPrefixSum);
120 |         //if(laneId == 0) printf("-------------------------------\n");
121 |
122 |     }
123 |
124 |
125 |
126 |     if(laneId == 0){
127 |         //printf("docId:%d\n", docId);
128 |         thetaCurIA[docId] = IAStart + ((tmpPrefixSum + 31)/32*32); // round nnz up to a multiple of 32
129 |     }
130 |
131 |
132 |
133 |
134 |     //print for debug
135 |     //if(laneId == 0){
136 |     //    for(int i = 0;i < 32;i ++)
137 |     //    {
138 |     //        printf("%4d:",i);
139 |     //        for(int j = 0;j < 32; j++)
140 |     //            printf("%d ", denseTheta[docId*k + i*32 + j]);
141 |     //        printf("\n");
142 |     //    }
143 |     //    for(int i = thetaMaxIA[docId]; i < thetaMaxIA[docId + 1];i++){
144 |     //        printf("%d,JA(%d), A(%d)\n", i, thetaJA[i], thetaA[i]);
145 |     //    }
146 |     //}
147 |
148 |     //break;
149 |
150 |
151 | }
152 |
153 |
154 |
155 | void LDAUpdateThetaAPI( // backup version: zero denseTheta, build histograms, then compact into sparse theta
156 |     int k,
157 |     int numDocs,
158 |     int docIdStart,
159 |     int chunkNumDocs,
160 |     long long *wordIndices,
161 |     int *wordTokens,
162 |     short *wordTopics,
163 |     long long *docRevIndices,
164 |     TokenIdxType *docRevIdx,
165 |     short *thetaA,
166 |     int *thetaCurIA,
167 |     int *thetaMaxIA,
168 |     short *thetaJA,
169 |     int *denseTheta,
170 |     cudaStream_t stream) // header declares a default of stream 0
171 | {
172 |
173 |     cudaMemsetAsync(denseTheta, 0, sizeof(int)*chunkNumDocs*k, stream); // Increase kernel assumes zeroed counts
174 |
175 |     LDAUpdateThetaIncreaseKernel<<>>( // NOTE(review): launch config appears stripped in this dump — original likely <<<grid, block, 0, stream>>>
176 |         k,
177 |         numDocs,
178 |         docIdStart,
179 |         chunkNumDocs,
180 |         wordIndices,
181 |         wordTokens,
182 |         wordTopics,
183 |         docRevIndices,
184 |         docRevIdx,
185 |         thetaA,
186 |         thetaCurIA,
187 |         thetaMaxIA,
188 |         thetaJA,
189 |         denseTheta);
190 |
191 |     cudaDeviceSynchronize(); // blocks the host; debug-style error checking
192 |     gpuErr(cudaPeekAtLastError());
193 |
194 |
195 |     //printf("chunkNumDocs:%d\n",chunkNumDocs);
196 |
197 |     //LDAUpdateThetaAlignKernel<<<(chunkNumDocs+1)/2, 64, 0, stream>>>(
198 |     LDAUpdateThetaAlignKernel<<<(chunkNumDocs+1)/2, 64, 0, stream>>>( // 64 threads = 2 warps/block, one warp per doc
199 |         k,
200 |         numDocs,
201 |         docIdStart,
202 |         chunkNumDocs,
203 |         wordIndices,
204 |         wordTokens,
205 |         wordTopics,
206 |         thetaA,
207 |         thetaCurIA,
208 |         thetaMaxIA,
209 |         thetaJA,
210 |         denseTheta
211 |     );
212 |
213 |     cudaDeviceSynchronize();
214 |     cudaPeekAtLastError();
215 |
216 |     //sleep(10);
217 |     //cudaDeviceSynchronize();
218 |     gpuErr(cudaPeekAtLastError());
219 |
220 |     //exit(0);
221 | }
--------------------------------------------------------------------------------
/src_culda/kernel/lda_theta_kernel.back.h:
--------------------------------------------------------------------------------
1 | #ifndef _LDA_THETA_KERNEL_H_ // NOTE(review): same guard as lda_theta_kernel.h — including both files would conflict
2 | #define _LDA_THETA_KERNEL_H_
3 |
4 | __global__ void LDAUpdateThetaIncreaseKernel( // backup: build dense per-doc topic histogram
5 |     int k,
6 |     int numDocs,
7 |     int docIdStart,
8 |     int chunkNumDocs,
9 |     long long *wordIndices,
10 |     int *wordTokens,
11 |     short *wordTopics,
12 |     long long *docRevIndices,
13 |     TokenIdxType *docRevIdx,
14 |     short *thetaA,
15 |     int *thetaCurIA,
16 |     int *thetaMaxIA,
17 |     short *thetaJA,
18 |     int *denseTheta);
19 |
20 | __global__ void LDAUpdateThetaAlignKernel( // backup: compact dense histogram into sparse theta
21 |     int k,
22 |     int numDocs,
23 |     int docIdStart,
24 |     int chunkNumDocs,
25 |     long long *wordIndices,
26 |     int *wordTokens,
27 |     short *wordTopics,
28 |     short *thetaA,
29 |     int *thetaCurIA,
30 |     int *thetaMaxIA,
31 |     short *thetaJA,
32 |     int *denseTheta);
33 |
34 | void LDAUpdateThetaAPI( // backup host launcher: memset + both kernels
35 |     int k,
36 |     int numDocs,
37 |     int docIdStart,
38 |     int chunkNumDocs,
39 |     long long *wordIndices,
40 |     int *wordTokens,
41 |     short *wordTopics,
42 |     long long *docRevIndices,
43 |     TokenIdxType *docRevIdx,
44 |     short *thetaA,
45 |     int *thetaCurIA,
46 |     int *thetaMaxIA,
47 |     short *thetaJA,
48 |     int *denseTheta,
49 |     cudaStream_t stream = 0);
50 |
51 | #endif
--------------------------------------------------------------------------------
/src_culda/kernel/lda_theta_kernel.cu:
--------------------------------------------------------------------------------
1 | #include "lda_train_kernel.h"
2 | #include "lda_theta_kernel.h"
3 |
4 |
5 | __global__ void LDAUpdateThetaKernel( // fused version: per-doc dense histogram + compaction into sparse theta in one kernel
6 |     int k,
7 |     int numDocs, // NOTE(review): unused in the body
8 |     int chunkNumDocs, // NOTE(review): unused in the body
9 |     int docIdStart, // documents processed are [docIdStart, docIdEnd)
10 |     int docIdEnd,
11 |     long long *wordIndices, // NOTE(review): unused in the body
12 |     int *wordTokens, // NOTE(review): unused in the body
13 |     short *wordTopics, // topic assignment per token
14 |     long long *docRevIndices, // per-doc offsets into docRevIdx
15 |     TokenIdxType *docRevIdx, // reverse index: doc-local entry -> token index
16 |     short *thetaA, // output: nonzero counts
17 |     int *thetaCurIA, // output: end offset of each doc's used region
18 |     int *thetaMaxIA, // input: start offset of each doc's region
19 |     short *thetaJA, // output: topic index per nonzero
20 |     int *denseTheta, // scratch: one k-wide row per worker (indexed workerId*k)
21 |     int numThetaWorkers
22 |     )
23 | {
24 |     volatile __shared__ int shaPrefixSum[64]; // one 32-slot prefix-sum row per warp (block = 64 threads)
25 |
26 |     int tid = threadIdx.x + blockIdx.x*blockDim.x;
27 |     int workerId = tid/32; // one warp per worker
28 |     int laneId = threadIdx.x%32;
29 |     int localId = threadIdx.x/32;
30 |
31 |     if(workerId >= numThetaWorkers)return;
32 |
33 |     for(int iteDocId = docIdStart + workerId; // workers stride over the chunk's documents
34 |         iteDocId < docIdEnd;
35 |         iteDocId += numThetaWorkers){
36 |
37 |         //clean the array
38 |         int startDenseIdx = workerId*k;
39 |         int endDenseIdx = workerId*k + k;
40 |         for(int denseIdx = startDenseIdx + laneId; denseIdx < endDenseIdx; denseIdx += 32)
41 |             denseTheta[denseIdx] = 0; // zero this worker's scratch row
42 |
43 |         //generate the dense array
44 |         for(long long idx = docRevIndices[iteDocId] + laneId; // lanes stride over this doc's tokens
45 |             idx < docRevIndices[iteDocId + 1];
46 |             idx += 32){
47 |             int topic = wordTopics[docRevIdx[idx]];
48 |             atomicAdd(&(denseTheta[startDenseIdx + topic]), 1); // lanes may collide on a topic
49 |         }
50 |
51 |         //generate the sparse array
52 |         int IAStart = thetaMaxIA[iteDocId];
53 |         int tmpPrefixSum = 0; // nonzeros emitted so far for this doc
54 |
55 |         for(int i = laneId; i < k;i += 32){ // 32 topics per round
56 |
57 |             //read
58 |             int tmpVal = denseTheta[startDenseIdx + i];
59 |             int tmpBin = tmpVal > 0; // flag nonzero entries
60 |             shaPrefixSum[localId*32 + laneId] = tmpBin;
61 |
62 |             //prefixsum (Hillis-Steele over the warp's flags; volatile shared memory, lock-step warp assumed)
63 |             if(laneId >= 1)
64 |                 shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 1] + shaPrefixSum[localId*32 + laneId];
65 |             if(laneId >= 2)
66 |                 shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 2] + shaPrefixSum[localId*32 + laneId];
67 |             if(laneId >= 4)
68 |                 shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 4] + shaPrefixSum[localId*32 + laneId];
69 |             if(laneId >= 8)
70 |                 shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 8] + shaPrefixSum[localId*32 + laneId];
71 |             if(laneId >= 16)
72 |                 shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 16] + shaPrefixSum[localId*32 + laneId];
73 |
74 |
75 |             int offset = tmpPrefixSum + shaPrefixSum[localId*32 + laneId] - 1; // slot of this lane's nonzero
76 |             if(tmpVal > 0){
77 |                 thetaA[IAStart + offset] = tmpVal;
78 |                 thetaJA[IAStart + offset] = i;
79 |             }
80 |             tmpPrefixSum += shaPrefixSum[localId*32 + 31]; // lane 31 holds the round's total
81 |         }
82 |
83 |         if(laneId == 0)
84 |             thetaCurIA[iteDocId] = IAStart + ((tmpPrefixSum + 31)/32*32); // round nnz up to a multiple of 32
85 |     }
86 |
87 | }
88 |
89 | void LDAUpdateThetaAPI( // host-side launcher for the fused LDAUpdateThetaKernel
90 |     int k,
91 |     int numDocs,
92 |     int chunkNumDocs,
93 |     int docIdStart,
94 |     int docIdEnd,
95 |     long long *wordIndices,
96 |     int *wordTokens,
97 |     short *wordTopics,
98 |     long long *docRevIndices,
99 |     TokenIdxType *docRevIdx,
100 |     short *thetaA,
101 |     int *thetaCurIA,
102 |     int *thetaMaxIA,
103 |     short *thetaJA,
104 |     int *denseTheta,
105 |     cudaStream_t stream // header declares a default of stream 0
106 |     )
107 | {
108 |     LDAUpdateThetaKernel<<<(UpdateNumWorkers+1)/2,64,0,stream>>>( // 64 threads = 2 warps/block, one warp per worker
109 |         k,
110 |         numDocs,
111 |         chunkNumDocs,
112 |         docIdStart,
113 |         docIdEnd,
114 |         wordIndices,
115 |         wordTokens,
116 |         wordTopics,
117 |         docRevIndices,
118 |         docRevIdx,
119 |         thetaA,
120 |         thetaCurIA,
121 |         thetaMaxIA,
122 |         thetaJA,
123 |         denseTheta,
124 |         UpdateNumWorkers // constant from culda_argument.h
125 |     );
126 | }
--------------------------------------------------------------------------------
/src_culda/kernel/lda_theta_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _LDA_THETA_KERNEL_H_ // declarations for the theta (doc-topic) kernels in lda_theta_kernel.cu
2 | #define _LDA_THETA_KERNEL_H_
3 |
4 | #include "../model/culda_argument.h" // TokenIdxType, UpdateNumWorkers
5 |
6 | __global__ void LDAUpdateThetaKernel( // fused histogram + sparse compaction, one warp per worker
7 |     int k,
8 |     int numDocs,
9 |     int chunkNumDocs,
10 |     int docIdStart,
11 |     int docIdEnd,
12 |     long long *wordIndices,
13 |     int *wordTokens,
14 |     short *wordTopics,
15 |     long long *docRevIndices,
16 |     TokenIdxType *docRevIdx,
17 |     short *thetaA,
18 |     int *thetaCurIA,
19 |     int *thetaMaxIA,
20 |     short *thetaJA,
21 |     int *denseTheta,
22 |     int numThetaWorkers
23 |     );
24 |
25 | void LDAUpdateThetaAPI( // host launcher for LDAUpdateThetaKernel
26 |     int k,
27 |     int numDocs,
28 |     int chunkNumDocs,
29 |     int docIdStart,
30 |     int docIdEnd,
31 |     long long *wordIndices,
32 |     int *wordTokens,
33 |     short *wordTopics,
34 |     long long *docRevIndices,
35 |     TokenIdxType *docRevIdx,
36 |     short *thetaA,
37 |     int *thetaCurIA,
38 |     int *thetaMaxIA,
39 |     short *thetaJA,
40 |     int *denseTheta,
41 |     cudaStream_t stream = 0 // default: legacy stream
42 |     );
43 |
44 |
45 | #endif
--------------------------------------------------------------------------------
/src_culda/kernel/lda_train_kernel.cu:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | #include "lda_train_kernel.h"
10 | #include "../model/culda_argument.h"
11 |
12 | using namespace std;
13 |
14 | __global__ void initRandState(curandState *state) // seed one curand state per thread; caller sizes the grid to the state array
15 | {
16 |     int tid = blockIdx.x*blockDim.x + threadIdx.x;
17 |     curand_init(clock() + tid, tid, 0,&state[tid]); // seed varies per run (clock) and per thread; subsequence = tid, offset = 0
18 | }
19 |
20 | __global__ void LDAKernelTrain(
21 | int k, //parameters
22 | float alpha,
23 | float beta,
24 | int numDocs, // corpora
25 | int numWords,
26 | long long numTokens,
27 | long long *wordIndices, // data, numWords + 1
28 | int *slotIdToWordId, // data, numSlots
29 | long long *slotIndices, // data, numSlots*2
30 | int *wordTokens, // data, numTokens
31 | short *wordTopics, // data, numTokens
32 | short *thetaA, //model, values, thetaNNZ
33 | int *thetaMaxIA, //model, offsets, numDocs + 1,
34 | int *thetaCurIA, //model, offsets, numDocs,
35 | short *thetaJA, //model, column indices, thetaNNZ
36 | int docIdStart,
37 | PHITYPE *phiTopicWord, //model, numWords*k
38 | int *phiTopic, //model, k
39 | half *phiHead, //model, numWords*k
40 | curandState *randState,
41 | int randStateSize,
42 | int GPUid,
43 | double *wordPerplexity, //numWords
44 | long long *docRevIndices)
45 | {
46 |
47 |
48 | int tid = threadIdx.x + blockIdx.x*blockDim.x;
49 | int workerId = tid/TrainBlockSize;
50 | int laneId = threadIdx.x%32;
51 | int localId = threadIdx.x/32;
52 |
53 | //samling index
54 | volatile __shared__ float prefixSumQTree[32];
55 | volatile __shared__ float prefixSumSTree[TrainBlockSize/32][32];
56 | volatile __shared__ float prefixSumSample[TrainBlockSize/32][32];
57 |
58 | //cache to store phi.
59 | volatile __shared__ float phiHeadCache[1024 + ShaMemPad + 0];
60 |
61 | int wordId = slotIdToWordId[workerId];
62 | long long tokenStart = __ldg(&slotIndices[workerId*2]);
63 | long long tokenEnd = __ldg(&slotIndices[workerId*2 + 1]);
64 |
65 | //load phi head into cache
66 | int tmpEnd = k/32;
67 |
68 | for(int QIdx = localId; QIdx < tmpEnd; QIdx += TrainBlockSize/32){
69 |
70 | int tmpK = QIdx*32 + laneId;
71 | float tmpVal = __half2float(phiHead[k*wordId + tmpK]);
72 | phiHeadCache[tmpK] = tmpVal;
73 |
74 | tmpVal = alpha*tmpVal;
75 | tmpVal += __shfl_down(tmpVal, 16);
76 | tmpVal += __shfl_down(tmpVal, 8);
77 | tmpVal += __shfl_down(tmpVal, 4);
78 | tmpVal += __shfl_down(tmpVal, 2);
79 | tmpVal += __shfl_down(tmpVal, 1);
80 | tmpVal = __shfl(tmpVal, 0);
81 | prefixSumQTree[QIdx] = tmpVal;
82 | }
83 | __syncthreads();
84 |
85 | //accumulation prefixSumQTree
86 | if(localId == 0){
87 | if(laneId >= 1)
88 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 1] + prefixSumQTree[laneId];
89 | if(laneId >= 2)
90 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 2] + prefixSumQTree[laneId];
91 | if(laneId >= 4)
92 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 4] + prefixSumQTree[laneId];
93 | if(laneId >= 8)
94 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 8] + prefixSumQTree[laneId];
95 | if(laneId >= 16)
96 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 16] + prefixSumQTree[laneId];
97 | }
98 | __syncthreads();
99 | float Q = prefixSumQTree[31];
100 |
101 | float sumPerplexity = 0.0;
102 |
103 | //int stateId = (workerId*TrainBlockSize/32 + localId)%randStateSize;
104 | for(int tokenIdx = tokenStart + localId;
105 | tokenIdx < tokenEnd;
106 | tokenIdx += TrainBlockSize/32) //iterate over tokens
107 | {
108 | int docId = __ldg(&wordTokens[tokenIdx]);
109 |
110 | //computing S.
111 | float S = 0;
112 | int IAStart = __ldg(&thetaMaxIA[docId]); //L1 cache
113 | int IACurEnd = __ldg(&thetaCurIA[docId]); //L1 cache
114 | prefixSumSTree[localId][laneId] = 0;
115 |
116 | for(int tmpIdx = IAStart + laneId, SIdx = 0;
117 | tmpIdx < IACurEnd;
118 | tmpIdx += 32){
119 |
120 | int colVal = __ldg(&thetaA[tmpIdx]); //L1 cache
121 | int colK = __ldg(&thetaJA[tmpIdx]); //L1 cache
122 | //int colVal = thetaA[tmpIdx];
123 | //int colK = thetaJA[tmpIdx];
124 | float tmpP1k = colVal*phiHeadCache[colK];
125 | //go reduce.
126 | tmpP1k += __shfl_down(tmpP1k, 16);
127 | tmpP1k += __shfl_down(tmpP1k, 8);
128 | tmpP1k += __shfl_down(tmpP1k, 4);
129 | tmpP1k += __shfl_down(tmpP1k, 2);
130 | tmpP1k += __shfl_down(tmpP1k, 1);
131 | tmpP1k = __shfl(tmpP1k, 0);
132 |
133 | S += tmpP1k;
134 | prefixSumSTree[localId][SIdx] = S;
135 | SIdx ++;
136 | }
137 | S = __shfl(S,0);
138 |
139 | //randomly generate u.
140 | float u;
141 | if(laneId == 0)u = curand_uniform(&(randState[workerId%randStateSize]));
142 | u = __shfl(u, 0);
143 | int newZ = 0;
144 |
145 | if(u < S/(S+Q))
146 | {
147 |
148 | //totalS ++;
149 | //tmpClock = clock64();
150 |
151 | float transU = u*(S+Q);
152 |
153 | float tmpSumHigh, tmpSumLow = 0.0;
154 | tmpSumHigh = prefixSumSTree[localId][laneId];
155 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32);
156 | if(laneId == 0)tmpSumLow = 0;
157 |
158 | int voteFlag = 0;
159 | if(transU < tmpSumHigh) voteFlag = 1;
160 | int lvl1Idx = __ffs(__ballot(voteFlag)) - 1;
161 |
162 | int overflowFlag = 0;
163 |
164 | if(lvl1Idx < 0) lvl1Idx = (IACurEnd - IAStart)/32 - 1;
165 |
166 | //float originalU = transU;
167 | transU = transU - tmpSumLow;
168 | transU = __shfl(transU, lvl1Idx);
169 |
170 | int tmpIdx = IAStart + lvl1Idx*32 + laneId;
171 | int tmpNewZ = __ldg(&thetaJA[tmpIdx]);
172 | int colVal = __ldg(&thetaA[tmpIdx]);
173 | float p1k = colVal*phiHeadCache[tmpNewZ];
174 |
175 | prefixSumSample[localId][laneId] = p1k;
176 |
177 | if(laneId >= 1) prefixSumSample[localId][laneId] =
178 | prefixSumSample[localId][laneId - 1] + prefixSumSample[localId][laneId];
179 | if(laneId >= 2) prefixSumSample[localId][laneId] =
180 | prefixSumSample[localId][laneId - 2] + prefixSumSample[localId][laneId];
181 | if(laneId >= 4) prefixSumSample[localId][laneId] =
182 | prefixSumSample[localId][laneId - 4] + prefixSumSample[localId][laneId];
183 | if(laneId >= 8) prefixSumSample[localId][laneId] =
184 | prefixSumSample[localId][laneId - 8] + prefixSumSample[localId][laneId];
185 | if(laneId >= 16)prefixSumSample[localId][laneId] =
186 | prefixSumSample[localId][laneId - 16] + prefixSumSample[localId][laneId];
187 |
188 | float tmpSum = prefixSumSample[localId][laneId];
189 |
190 | voteFlag = 0;
191 | if(transU < tmpSum) voteFlag = 1;
192 | int offset = __ffs(__ballot(voteFlag)) - 1;
193 |
194 | //int offset1 = offset;
195 | //offset = 31 - __clz(__ballot(colVal>0));
196 | newZ = __shfl(tmpNewZ, offset);
197 |
198 | }
199 | else //bucket Q
200 | {
201 |
202 | float transU = (u - S/(S+Q))*(S+Q);
203 | //totalQ ++;
204 | //float originalU = transU;
205 |
206 | //level 1: decide position
207 | float tmpSumHigh, tmpSumLow = 0.0;
208 | tmpSumHigh = prefixSumQTree[laneId];
209 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32);
210 | if(laneId == 0)tmpSumLow = 0;
211 |
212 | //voting for lvl1Idx
213 | int voteFlag = 0;
214 | if(transU < tmpSumHigh) voteFlag = 1; //voteFlag = transU < tmpSumHigh;
215 | int lvl1Idx = __ffs(__ballot(voteFlag)) - 1;
216 | if(lvl1Idx < 0) lvl1Idx = 31;
217 |
218 |
219 |
220 | float originalU = transU;
221 | transU = transU - tmpSumLow;
222 | transU = __shfl(transU, lvl1Idx);
223 |
224 | prefixSumSample[localId][laneId] = alpha*phiHeadCache[32*lvl1Idx + laneId];
225 |
226 | // accumulation
227 | if(laneId >= 1) prefixSumSample[localId][laneId] =
228 | prefixSumSample[localId][laneId - 1] + prefixSumSample[localId][laneId];
229 | if(laneId >= 2) prefixSumSample[localId][laneId] =
230 | prefixSumSample[localId][laneId - 2] + prefixSumSample[localId][laneId];
231 | if(laneId >= 4) prefixSumSample[localId][laneId] =
232 | prefixSumSample[localId][laneId - 4] + prefixSumSample[localId][laneId];
233 | if(laneId >= 8) prefixSumSample[localId][laneId] =
234 | prefixSumSample[localId][laneId - 8] + prefixSumSample[localId][laneId];
235 | if(laneId >= 16)prefixSumSample[localId][laneId] =
236 | prefixSumSample[localId][laneId - 16] + prefixSumSample[localId][laneId];
237 |
238 | voteFlag = 0;
239 | tmpSumLow = 0;
240 | tmpSumHigh = prefixSumSample[localId][laneId];
241 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32);
242 |
243 | if(laneId == 0)tmpSumLow = 0;
244 |
245 | if( transU < tmpSumHigh)voteFlag = 1; //voteFlag = transU < tmpSumHigh;
246 | int lvl2Idx = __ffs(__ballot(voteFlag)) - 1;
247 |
248 | if(lvl2Idx < 0)lvl2Idx = 31;
249 |
250 | newZ = lvl1Idx*32 + lvl2Idx;
251 |
252 | //if(tmpFlag == 1)return;
253 | }
254 |
255 | //update & get perplexity
256 | if(laneId == 0){
257 | wordTopics[tokenIdx] = newZ;
258 | sumPerplexity += log((S+Q)/(docRevIndices[docId + 1] - docRevIndices[docId] + k*alpha));
259 | }
260 | }
261 | if(threadIdx.x%32 == 0)
262 | wordPerplexity[(threadIdx.x+blockDim.x*blockIdx.x)/32] = sumPerplexity;
263 |
264 | /*
265 | float totalTime = (clock64() - startClock)/1000000000.0;
266 | if(GPUid == 0 && laneId == 0 && localId == 0 && workerId < -1){
267 | //printf("worker id:%5d, time: %.2fB S1time: %.2fB S2time: %.2fB Qtime: %.2fB, other: %.2fB, innerLoopTime: %.2fB\n", workerId, totalTime, S1Time, S2Time, QTime, otherTime, innerLoopTime);
268 | printf("worker id:%5d, time: %.2fB\n", workerId, totalTime);
269 | }
270 | */
271 | }
272 |
__global__ void LDATrainPerplexityReduce1(double *perplexity, double *perplexityMid, int numVals){

    // First-stage reduction: each warp sums a contiguous, 32-aligned slice of
    // perplexity[0..numVals) and writes its partial sum to perplexityMid[warpId].
    int numWarps = gridDim.x*blockDim.x/32;
    int tid = threadIdx.x + blockIdx.x*blockDim.x;
    int warpId = tid/32;
    int laneId = tid%32;

    // Per-warp slice size rounded up to a multiple of 32 so every lane runs
    // the same number of loop iterations (required for the warp shuffles).
    int perWarpSize = ((numVals + numWarps - 1)/numWarps + 31)/32*32;
    int startIdx = perWarpSize*warpId + laneId;
    int endIdx = perWarpSize*warpId + perWarpSize;

    double totalProd = 0;
    for(long long i = startIdx;i < endIdx; i += 32){

        // BUG FIX: the per-iteration accumulator was declared `int`, which
        // truncated the double log-likelihood values (small fractional,
        // typically negative numbers) before summation. Declared double, as
        // in LDATrainPerplexityReduce2 below, which already shuffles doubles.
        double tmpProd = 0;
        if(i < numVals)tmpProd = perplexity[i];

        // warp-wide tree reduction; lane 0 ends up with the slice sum
        tmpProd += __shfl_down(tmpProd, 16);
        tmpProd += __shfl_down(tmpProd, 8);
        tmpProd += __shfl_down(tmpProd, 4);
        tmpProd += __shfl_down(tmpProd, 2);
        tmpProd += __shfl_down(tmpProd, 1);

        totalProd += tmpProd;
    }

    if(laneId == 0) perplexityMid[warpId] = totalProd;
}
303 |
__global__ void LDATrainPerplexityReduce2(double *perplexityMid)
{
    // Second-stage reduction: a single warp folds the ReduceParameter partial
    // sums produced by LDATrainPerplexityReduce1 into perplexityMid[0].
    double sum = 0;
    for(int idx = threadIdx.x; idx < ReduceParameter; idx += 32){
        double partial = perplexityMid[idx];

        // warp-level tree reduction (same shuffle sequence as stage 1)
        for(int delta = 16; delta >= 1; delta >>= 1)
            partial += __shfl_down(partial, delta);

        sum += partial;
    }

    if(threadIdx.x == 0)perplexityMid[0] = sum;
}
322 |
double LDATrainPerplexity(Document &doc, cudaStream_t *streams)
{
    // Host driver: reduce the per-warp log-likelihood values written by
    // LDAKernelTrain into one scalar and normalize by the token count.
    // streams == NULL: synchronous, one chunk/GPU at a time.
    // streams != NULL: each chunk's reduction is enqueued on its own stream.
    //
    // NOTE(review): the kernel launch configurations were missing from this
    // file (stripped "<<< >>>" contents). They are restored so that reduce 1
    // produces exactly ReduceParameter warp results, matching the
    // ReduceParameter entries that LDATrainPerplexityReduce2 consumes —
    // verify against the original build.
    double tmpSum[MaxNumGPU];
    double sum = 0;
    if(streams == NULL){

        for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){

            cudaSetDevice(chunkId);

            //reduce 1: ReduceParameter blocks of 32 threads = ReduceParameter warps.
            LDATrainPerplexityReduce1<<<ReduceParameter, 32>>>(
                doc.docChunkVec[chunkId]->deviceWordPerplexity,
                doc.docChunkVec[chunkId]->deviceWordPerplexityMid,
                doc.docChunkVec[chunkId]->numWords*(TrainBlockSize/32));

            //reduce 2: one warp folds the partial sums into perplexityMid[0].
            LDATrainPerplexityReduce2<<<1,32,0>>>(doc.docChunkVec[chunkId]->deviceWordPerplexityMid);

            // synchronous copy of the final scalar (also synchronizes the GPU)
            cudaMemcpy(tmpSum, doc.docChunkVec[chunkId]->deviceWordPerplexityMid, sizeof(double), cudaMemcpyDeviceToHost);

            sum += tmpSum[0];
        }
        return (sum/doc.numTokens);
    }
    else
    {

        for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
        {
            cudaSetDevice(chunkId);
            //reduce 1 (async on this chunk's stream).
            // BUG FIX: this branch passed numWords alone while the synchronous
            // branch passed numWords*(TrainBlockSize/32); the training kernel
            // emits one value per warp, so the counts are made consistent.
            LDATrainPerplexityReduce1<<<ReduceParameter, 32, 0, streams[chunkId]>>>(
                doc.docChunkVec[chunkId]->deviceWordPerplexity,
                doc.docChunkVec[chunkId]->deviceWordPerplexityMid,
                doc.docChunkVec[chunkId]->numWords*(TrainBlockSize/32));

            //reduce 2 (async on the same stream).
            LDATrainPerplexityReduce2<<<1,32,0, streams[chunkId]>>>(doc.docChunkVec[chunkId]->deviceWordPerplexityMid);
            cudaMemcpyAsync(tmpSum + chunkId, doc.docChunkVec[chunkId]->deviceWordPerplexityMid, sizeof(double), cudaMemcpyDeviceToHost, streams[chunkId]);
        }

        for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
            cudaStreamSynchronize(streams[chunkId]);

        for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
            sum += tmpSum[chunkId];

        return sum/doc.numTokens;
    }
}
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
--------------------------------------------------------------------------------
/src_culda/kernel/lda_train_kernel.h:
--------------------------------------------------------------------------------
#ifndef _LDA_TRAIN_KERNEL_H_
#define _LDA_TRAIN_KERNEL_H_

// NOTE(review): the include targets were missing (stripped "<...>" spans);
// restored to the headers that provide the types used below (cudaStream_t,
// curandState, half) — verify against the original build.
#include <cstdio>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <cuda_fp16.h>

#include "../model/culda_argument.h"
#include "../model/doc.h"

// Initialize the curand states consumed by LDAKernelTrain.
__global__ void initRandState(curandState *state);

// One CGS sampling sweep over the tokens of a chunk; also emits per-warp
// log-likelihood values into wordPerplexity (see LDATrainPerplexity).
__global__ void LDAKernelTrain(
    int k, //parameters
    float alpha,
    float beta,
    int numDocs, // corpora
    int numWords,
    long long numTokens,
    long long *wordIndices, // data, numWords + 1
    int *slotIdToWordId, // data, numSlots
    long long *slotIndices, // data, numSlots*2
    int *wordTokens, // data, numTokens
    short *wordTopics, // data, numTokens
    short *thetaA, //model, values, thetaNNZ
    int *thetaMaxIA, //model, offsets, numDocs + 1,
    int *thetaCurIA, //model, offsets, numDocs,
    short *thetaJA, //model, column indices, thetaNNZ
    int docIdStart,
    PHITYPE *phiTopicWord, //model, numWords*k
    int *phiTopic, //model, k
    half *phiHead, //model, numWords*k
    curandState *randState,
    int randStateSize,
    int GPUid,
    double *wordPerplexity,
    long long *docRevIndices);

// Reduce per-warp log-likelihoods to the corpus average; synchronous when
// streams == NULL, otherwise one stream per chunk.
double LDATrainPerplexity(Document &, cudaStream_t *streams = NULL);

#endif
--------------------------------------------------------------------------------
/src_culda/main.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 | #include
14 |
15 | #include
16 |
17 | #include "./model/model_theta.h"
18 | #include "./model/culda_argument.h"
19 | #include "./model/vocab.h"
20 | #include "./model/doc.h"
21 |
22 | #include "./train/lda_train.h"
23 |
24 |
25 | using namespace std;
26 |
27 |
// Return true iff str is a non-empty string made only of decimal digits,
// i.e. a non-negative integer literal (no sign, no whitespace allowed).
bool ISNumerical(char *str)
{
    int c = 0;
    while(*str != '\0')
    {
        // BUG FIX: cast to unsigned char — passing a plain (possibly
        // negative) char to isdigit() is undefined behavior per the C
        // standard.
        if(isdigit((unsigned char)*str))c++;
        else return false;
        str++;
    }
    return c > 0;
}
39 |
40 | Argument ParseArgument(int argc, char **argv)
41 | {
42 | vector args;
43 | for(int i = 0;i < argc; i++){
44 | args.push_back(string(argv[i]));
45 | }
46 |
47 | if(argc == 1)
48 | throw invalid_argument("No argument found");
49 |
50 | Argument argument;
51 | int i;
52 |
53 | for(i = 1;i < argc; i++){
54 |
55 | if(args[i].compare("-g") == 0){
56 |
57 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -g");
58 | if(!ISNumerical(argv[i+1]))
59 | throw invalid_argument("-k should be followed by a positive integer");
60 | argument.numGPUs = atoi(argv[i+1]);
61 | i++;
62 | }
63 | else if(args[i].compare("-k") == 0){
64 |
65 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -k");
66 | if(!ISNumerical(argv[i+1]))
67 | throw invalid_argument("-k should be followed by a positive integer");
68 | argument.k = atoi(argv[i+1]);
69 |
70 | //TBD: check k
71 | i++;
72 | }
73 | else if(args[i].compare("-t") == 0){
74 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -t");
75 | if(!ISNumerical(argv[i+1]))
76 | throw invalid_argument("-t should be followed by a positive integer");
77 | argument.iteration = atoi(argv[i+1]);
78 |
79 | //TBD: check t
80 | i++;
81 | }
82 | else if(args[i].compare("-s") == 0){
83 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -s");
84 | if(!ISNumerical(argv[i+1]))
85 | throw invalid_argument("-s should be followed by a positive integer");
86 | argument.numWorkers = atoi(argv[i+1]);
87 |
88 | i++;
89 | }
90 | else if(args[i].compare("-a") == 0){
91 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -a");
92 |
93 | argument.alpha = atof(argv[i+1]);
94 |
95 | //TBD: check it
96 | i++;
97 | }
98 | else if(args[i].compare("-b") == 0){
99 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -b");
100 | //if(!ISNumerical(argv[i+1]))
101 | // throw invalid_argument("-b should be followed by a number");
102 | argument.beta = atof(argv[i+1]);
103 |
104 | //TBD: check it
105 | i++;
106 | }
107 | else if(args[i].compare("-c") == 0){
108 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -b");
109 | //if(!ISNumerical(argv[i+1]))
110 | // throw invalid_argument("-b should be followed by a number");
111 | argument.numChunks = atoi(argv[i+1]);
112 |
113 | //TBD: check it
114 | i++;
115 | }
116 | else if(args[i].compare("-i") == 0){
117 | if((i + 1) >= argc)throw invalid_argument("need to specify a file name prefix after -i");
118 | argument.inputFilePrefix = args[i+1];
119 | i++;
120 | }
121 | else if(args[i].compare("-o") == 0){
122 | if((i + 1) >= argc)throw invalid_argument("need to specify a file name prefix after -o");
123 | argument.outputFilePrefix = args[i+1];
124 | i++;
125 | }
126 | else break;
127 |
128 | }
129 |
130 |
131 | //process k
132 | if(argument.k%32 != 0){
133 | printf("Warning: number of topics(k) has been rounded to multiples of 32.\n");
134 | argument.k = (argument.k + 31)/32*32;
135 | }
136 |
137 |
138 | if (argument.numWorkers <= 0){
139 | printf("Warning: wrong number of workers.\n");
140 | argument.numWorkers = 1;
141 | }
142 |
143 | //process output file names.
144 | argument.outputWordFileName = argument.outputFilePrefix + ".word.full.txt";
145 | argument.outputDocFileName = argument.outputFilePrefix + ".doc.full.txt";
146 |
147 | //GPU number
148 | int deviceCount = 1;
149 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
150 |
151 | if(deviceCount >= MaxNumGPU)deviceCount = MaxNumGPU;
152 |
153 | if(argument.numGPUs <= 0)argument.numGPUs = 1;
154 | if(argument.numGPUs > deviceCount){
155 | printf("Warning: number of GPUs(%d) is larger than device count(%d), rounded to %d\n", argument.numGPUs,deviceCount,deviceCount);
156 | argument.numGPUs = deviceCount;
157 | }
158 |
159 |
160 | return argument;
161 | }
162 |
163 |
164 | int main(int argc, char**argv)
165 | {
166 | clock_t clockStart;
167 | Argument argument;
168 |
169 | printf("Parsing arguments ...\n");
170 | try{
171 | argument = ParseArgument(argc, argv);
172 | }
173 | catch(invalid_argument &e){
174 | cout << "Error: " <
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include
13 |
14 |
// Shared-memory cache size used by the sampling kernels
// (NOTE(review): exact use is in the kernel files — confirm there).
const int SCacheSize = 64;

// Number of concurrent workers for the phi/theta update kernels;
// tuned per architecture, see the note below.
const int UpdateNumWorkers = 28*16;
/*/
Optimal on Titan X: 24*16
Optimal on P100 : 56*16
Optimal on V100 : 80*16
*/


// Threads per block for the training (sampling) kernel; warps inside a
// block act as independent workers (TrainBlockSize/32 warps per block).
const int TrainBlockSize = 1024;
// Number of concurrent sampling workers; doc_chunk.cpp sizes word "slots"
// so no slot is much larger than numTokens/NumConWorkers.
const int NumConWorkers = 28*2;
// Extra shared-memory padding (currently disabled).
const int ShaMemPad = 0;


// Maximum GPUs/chunks handled simultaneously (sizes host-side arrays).
const int MaxNumGPU = 32;
// Size of the intermediate buffer for the two-stage perplexity reduction.
const int ReduceParameter = 1024;

// Storage type for the phi (topic-word) counters.
//typedef unsigned short PHITYPE;
typedef int PHITYPE;

// Storage type for token indices in the per-document reverse index.
typedef int TokenIdxType;

using namespace std;
39 |
40 | #define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); }
41 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
42 | {
43 | if (code != cudaSuccess)
44 | {
45 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
46 | if (abort) exit(code);
47 | }
48 | }
49 |
// Bag of run-time configuration values filled in by ParseArgument (main.cpp)
// from the command line.
class Argument
{
public:
    int numGPUs;     // -g: number of GPUs to use
    int k;           // -k: number of topics (rounded up to a multiple of 32)
    int iteration;   // -t: number of training iterations
    int numWorkers;  // -s: number of sampling workers
    int numChunks;   // -c: number of data chunks
    std::string inputFilePrefix;   // -i: prefix of the formatted input files

    std::string outputFilePrefix;  // -o: prefix for output files

    std::string outputWordFileName; // derived: outputFilePrefix + ".word.full.txt"
    std::string outputDocFileName;  // derived: outputFilePrefix + ".doc.full.txt"

    float alpha;  // -a: LDA alpha hyperparameter (used as k*alpha in the kernels)
    float beta;   // -b: LDA beta hyperparameter


    // Dump every field to stdout (debugging aid).
    void printArgument(){

        printf("numGPUs :%d\n", numGPUs);
        printf("k :%d\n", k);
        printf("iteration :%d\n", iteration);
        printf("numWorkers :%d\n", numWorkers);
        printf("numChunks :%d\n", numChunks);
        printf("alpha :%.2f\n", alpha);
        printf("beta :%.2f\n", beta);
        printf("prefix :%s\n", inputFilePrefix.c_str());
        printf("outfile :%s\n", outputFilePrefix.c_str());
        printf("\n");
    }
};
83 |
84 | #endif
--------------------------------------------------------------------------------
/src_culda/model/doc.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 |
13 | #include "doc.h"
14 |
15 | using namespace std;
16 | //Definitions of Class Document's methods.
17 |
18 |
19 | Document::Document():
20 | numDocs(0),
21 | numWords(0),
22 | numTokens(0),
23 | docIndices(NULL),
24 | docLength(NULL)
25 | {
26 | }
27 |
// Construct a Document and immediately load the corpus files identified by
// filePrefix, partitioned into argNumChunks chunks.
Document::Document(const string &filePrefix, int argNumChunks):
    numDocs(0),
    numWords(0),
    numTokens(0),
    docIndices(NULL),
    docLength(NULL)
{
    // numChunks is also set inside loadDocument; the duplicate is harmless.
    numChunks = argNumChunks;
    loadDocument(filePrefix, argNumChunks);
}
38 |
39 | Document::Document(const Document &doc):
40 | numDocs(0),
41 | numWords(0),
42 | numTokens(0),
43 | docIndices(NULL),
44 | docLength(NULL)
45 | {
46 |
47 | //TBD!!!
48 | /*
49 | numWords = doc.numWords;
50 | numTokens = doc.numTokens;
51 | ids = doc.ids;
52 |
53 | tokens = new int[numTokens];
54 | indices = new long long[numWords*2];
55 |
56 | copy(doc.tokens, doc.tokens + numTokens, tokens);
57 | copy(doc.indices, doc.indices + numWords*2, indices);
58 | */
59 | }
60 |
61 |
62 | void Document::loadDocument(const string &filePrefix, int argNumChunks)
63 | {
64 | numChunks = argNumChunks;
65 |
66 | /* Load docIndices and docLength*/
67 | string docIdxFileName = filePrefix + ".doc.idx";
68 | ifstream docIdxStream(docIdxFileName.c_str(), ios::in);
69 | if(!docIdxStream.is_open()){
70 | cout << "File " << docIdxFileName << " open failed" << endl;
71 | exit(0);
72 | }
73 |
74 | numDocs = 0;
75 | int docId;
76 | long long endIdx;
77 | vector docIndicesVec;
78 | docIndicesVec.push_back(0);
79 | while(docIdxStream >> docId >> endIdx)
80 | {
81 | numDocs ++;
82 | docIndicesVec.push_back(endIdx);
83 | }
84 | docIdxStream.close();
85 |
86 | docIndices = new long long[numDocs + 1];
87 | docLength = new int[numDocs];
88 |
89 | for(int i = 0;i < numDocs; i++)
90 | docLength[i] = docIndicesVec[i + 1] - docIndicesVec[i];
91 |
92 | for(int i = 0;i < numDocs + 1; i++)
93 | docIndices[i] = docIndicesVec[i];
94 |
95 | docIndicesVec.clear();
96 |
97 | //load .chunk meta data.
98 | vector chunkDocVec;
99 | ifstream chunkFileStream((filePrefix + ".chunk").c_str(), ios::in);
100 | if(!chunkFileStream.is_open()){
101 | cout << "File " << filePrefix << ".chunk open failed" << endl;
102 | exit(0);
103 | }
104 | chunkDocVec.push_back(0);
105 | int tmp1, tmp2;
106 | while(chunkFileStream >> tmp1 >> tmp2){
107 | chunkDocVec.push_back(tmp2);
108 | }
109 | chunkFileStream.close();
110 |
111 | if(chunkDocVec.size() != (numChunks + 1)){
112 | printf("Error: numChunks(%d) does not match the chunk file\n",numChunks);
113 | exit(0);
114 | }
115 |
116 | //load data
117 | for(int chunkId = 0;chunkId < numChunks; chunkId ++){
118 | docChunkVec.push_back(
119 | new DocChunk(
120 | chunkId,
121 | chunkDocVec[chunkId],
122 | chunkDocVec[chunkId + 1],
123 | numDocs,
124 | numChunks));
125 | }
126 |
127 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){
128 |
129 | stringstream tmpStream;
130 | string wordIdxFileName, wordDataFileName;
131 | if(numChunks == 1){
132 | tmpStream << filePrefix << ".word.idx";
133 | tmpStream >> wordIdxFileName;
134 | tmpStream.clear();
135 | tmpStream << filePrefix << ".word.data";
136 | tmpStream >> wordDataFileName;
137 | }
138 | else{
139 | tmpStream << filePrefix << ".word.idx" << chunkId;
140 | tmpStream >> wordIdxFileName;
141 | tmpStream.clear();
142 | tmpStream << filePrefix << ".word.data" << chunkId;
143 | tmpStream >> wordDataFileName;
144 | }
145 | docChunkVec[chunkId]->loadChunk(wordIdxFileName, wordDataFileName, docLength);
146 |
147 | printf(" chunk %d loaded ...\n", chunkId);
148 | }
149 | numWords = docChunkVec[0]->numWords;
150 | printf("\n");
151 |
152 | numTokens = 0;
153 | for(int chunkId = 0; chunkId < numChunks; chunkId ++)
154 | numTokens += docChunkVec[chunkId]->chunkNumTokens;
155 |
156 | }
157 |
158 | void Document::generateTopics(int k) //TBD: parallelization
159 | {
160 |
161 | printf("Initialize the topic for tokens ...\n\n");
162 | for(int chunkId = 0; chunkId < numChunks; chunkId ++)
163 | docChunkVec[chunkId]->generateTopics(k);
164 | }
165 |
166 | void Document::clear()
167 | {
168 | numDocs = 0;
169 | numWords = 0;
170 | numTokens = 0;
171 |
172 | for(int chunkId = 0; chunkId < numChunks;chunkId ++)
173 | delete docChunkVec[chunkId];
174 |
175 | if(docIndices != NULL) delete []docIndices;
176 | if(docLength != NULL) delete []docLength;
177 |
178 | docIndices = NULL;
179 | docLength = NULL;
180 | }
181 |
182 | void Document::printDocumentAbbr()
183 | {
184 | printf("numDocs : %d\n",numDocs);
185 | printf("numWords : %d\n",numWords);
186 | printf("numTokens: %lld\n",numTokens);
187 |
188 | printf("\n");
189 | }
190 |
191 |
192 | void Document::printDocumentFull()
193 | {
194 | printf("----doc info-----\n");
195 | printf("numDocs : %d\n",numDocs);
196 | printf("numWords : %d\n",numWords);
197 | printf("numTokens: %lld\n",numTokens);
198 |
199 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){
200 | printf("**chunkId:%d\n", chunkId);
201 | printf("word range:\n");
202 | for(long long i = 0;i < numWords; i++)
203 | printf("word%2d %6lld - %lld\n",i,
204 | docChunkVec[chunkId]->wordIndices[i],
205 | docChunkVec[chunkId]->wordIndices[i+1]);
206 |
207 | printf("token list:\n");
208 | printf("tokenId, wordId, DocId, topics\n");
209 | for(long long wordId = 0;wordId < numWords; wordId++){
210 |
211 | long long start = docChunkVec[chunkId]->wordIndices[wordId];
212 | long long end = docChunkVec[chunkId]->wordIndices[wordId + 1];
213 | for(long long tokenId = start; tokenId < end; tokenId ++)
214 | printf("%lld, %d, %d, %d\n",tokenId, wordId,
215 | docChunkVec[chunkId]->wordTokens[tokenId],
216 | docChunkVec[chunkId]->wordTopics[tokenId]);
217 | }
218 | }
219 |
220 | /*
221 | for(long long i = 0;i < numTokens;i++)
222 | printf("%4lld %lld %lld\n", i, wordTokens[i], wordTopics[i]);
223 | */
224 |
225 | }
226 |
// Persist the sampled topic assignments to fileName.
// NOTE(review): not implemented — currently only prints a status line.
void Document::saveTopics(string fileName)
{
    printf("Saving topics ...\n");
}
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/src_culda/model/doc.h:
--------------------------------------------------------------------------------
1 | #ifndef _DOC_H_
2 |
3 | #define _DOC_H_
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 |
12 | #include "doc_chunk.h"
13 |
14 | using namespace std;
15 |
16 | /*
17 | * word-first format
18 | * fileName.word.data: docId list
19 | * fileName.word.idx: each line is consisted of: wordId, startIdx, endIdx;
20 | * fileName.vocab: word-2-id mapping
21 | */
22 |
23 |
24 |
25 | class Document
26 | {
27 |
28 | public:
29 |
30 | int numDocs;
31 | int numWords;
32 | long long numTokens;
33 | int numChunks;
34 | int numWorkers;
35 |
36 | long long *docIndices; // numDocs + 1
37 | int *docLength; // numDocs
38 |
39 | vector docChunkVec;
40 |
41 | Document();
42 | Document(const string &filePrefix, int argNumChunks);
43 | Document(const Document &doc);
44 |
45 | void loadDocument(const string &filePrefix, int argNumChunks);
46 |
47 | void clear();
48 |
49 | ~Document(){ clear(); }
50 |
51 | void printDocumentAbbr();
52 | void printDocumentFull();
53 | void generateTopics(int k);
54 | void saveTopics(string fileName);
55 |
56 | };
57 |
58 |
59 | #endif
--------------------------------------------------------------------------------
/src_culda/model/doc_chunk.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include "doc_chunk.h"
12 |
13 | using namespace std;
14 |
// Default-construct an empty chunk: every host and device buffer is NULL so
// loadChunk()/allocGPU() can detect and (re)allocate them safely.
DocChunk::DocChunk():
    wordIndices(NULL),
    slotIdToWordId(NULL),
    slotIndices(NULL),
    wordTokens(NULL),
    wordTopics(NULL),
    wordPerplexity(NULL),
    deviceWordIndices(NULL),
    deviceSlotIdToWordId(NULL),
    deviceSlotIndices(NULL),
    deviceWordTokens(NULL),
    deviceWordTopics(NULL),
    deviceWordPerplexity(NULL),
    deviceWordPerplexityMid(NULL),
    docRevIndices(NULL),
    docRevIdx(NULL),
    deviceDocRevIndices(NULL),
    deviceDocRevIdx(NULL)
{
}
35 |
36 | DocChunk::DocChunk(int argChunkId, int argDocIdStart, int argDocIdEnd, int argNumDocs, int argNumChunks):
37 | wordIndices(NULL),
38 | slotIdToWordId(NULL),
39 | slotIndices(NULL),
40 | wordTokens(NULL),
41 | wordTopics(NULL),
42 | wordPerplexity(NULL),
43 | deviceWordIndices(NULL),
44 | deviceSlotIdToWordId(NULL),
45 | deviceSlotIndices(NULL),
46 | deviceWordTokens(NULL),
47 | deviceWordTopics(NULL),
48 | deviceWordPerplexity(NULL),
49 | deviceWordPerplexityMid(NULL),
50 | docRevIndices(NULL),
51 | docRevIdx(NULL),
52 | deviceDocRevIndices(NULL),
53 | deviceDocRevIdx(NULL)
54 | {
55 |
56 | chunkId = argChunkId;
57 | docIdStart = argDocIdStart;
58 | docIdEnd = argDocIdEnd;
59 | chunkNumDocs = docIdEnd - docIdStart;
60 | numDocs = argNumDocs;
61 | numChunks = argNumChunks;
62 | }
63 |
64 | struct pthreadArgTheta
65 | {
66 | int numWords;
67 | int docStart;
68 | int docEnd;
69 | long long *wordIndices;
70 | int *wordTokens;
71 |
72 | vector > *tmpDocPtr;
73 | };
74 |
75 | static void *ThetaDocReverse(void *arg)
76 | {
77 | pthreadArgTheta *localArg = (pthreadArgTheta*)arg;
78 |
79 | for(int wordId = 0; wordId < localArg->numWords; wordId ++){
80 |
81 | for(long long tokenId = localArg->wordIndices[wordId];
82 | tokenId < localArg->wordIndices[wordId + 1];
83 | tokenId ++){
84 |
85 | int tmpDocId = localArg->wordTokens[tokenId];
86 | if( tmpDocId >= localArg->docStart && tmpDocId < localArg->docEnd)
87 | ((*(localArg->tmpDocPtr))[tmpDocId]).push_back(tokenId);
88 | }
89 | }
90 | }
91 |
92 |
// Helper pair for ordering words by descending token count: std::sort's
// ascending order combined with "greater length compares less" puts the
// longest words first, so heavy words are sliced into slots before light ones.
class SortClass
{
public:
    int wordId;   // original word id
    int wordLen;  // token count of that word

    SortClass(int a, int b) : wordId(a), wordLen(b) {}

    friend bool operator<(const SortClass &a, const SortClass &b)
    {
        return b.wordLen < a.wordLen;
    }
};
105 |
106 | void DocChunk::loadChunk(string wordIdxFileName, string wordDataFileName, int *docLength)
107 | {
108 |
109 | printf(" loading chunk %d ...\n", chunkId);
110 | clock_t clockStart = clock();
111 | /* load wordIndices & wordLength*/
112 | ifstream wordIdxStream(wordIdxFileName.c_str(), ios::in);
113 | if(!wordIdxStream.is_open()){
114 | cout << "File " << wordIdxFileName << " open failed" << endl;
115 | exit(0);
116 | }
117 |
118 | numWords = 0;
119 | long long wordId, endIdx;
120 | vectorwordIndicesVec;
121 | wordIndicesVec.push_back(0);
122 | while(wordIdxStream >> wordId >> endIdx)
123 | wordIndicesVec.push_back(endIdx);
124 |
125 | numWords = wordIndicesVec.size() - 1;
126 | wordIdxStream.close();
127 |
128 | vector wordLengthVec;
129 | wordIndices = new long long[numWords + 1];
130 |
131 | for(int i = 0;i < numWords; i++)
132 | wordLengthVec.push_back(wordIndicesVec[i + 1] - wordIndicesVec[i]);
133 | for(int i = 0;i < numWords + 1; i++)
134 | wordIndices[i] = wordIndicesVec[i];
135 |
136 | /* load token number */
137 | long long wordDataFileSize;
138 | ifstream wordDataStream(wordDataFileName.c_str(), ios::in|ios::ate);
139 | if(!wordDataStream.is_open()){
140 | cout << "File " << wordDataFileName << " open failed" << endl;
141 | exit(0);
142 | }
143 | wordDataFileSize = wordDataStream.tellg();
144 | chunkNumTokens = wordDataFileSize/sizeof(int);
145 | wordDataStream.close();
146 |
147 | /* sort words & slice words into slots when necessary */
148 | vector sortVec;
149 | for(int i = 0;i < numWords;i++)
150 | sortVec.push_back(SortClass(i, wordLengthVec[i]));
151 | sort(sortVec.begin(), sortVec.end());
152 |
153 | for(int i = 0;i < 10;i++)
154 | printf("i:%d, wordId:%d, len:%d\n", i, sortVec[i].wordId, sortVec[i].wordLen);
155 |
156 | int aveTokens = chunkNumTokens/NumConWorkers;
157 | printf("aveTokens:%d\n", aveTokens);
158 |
159 | vector slotToWordVec;
160 | vector slotIndicesVec;
161 | numSlots = 0;
162 | for(int i = 0; i < numWords; i ++){
163 | //printf("%d\n",i);
164 | int tmpWordId = sortVec[i].wordId;
165 | int tmpWordLen = sortVec[i].wordLen;
166 |
167 | if(tmpWordLen > 1.05*aveTokens){
168 |
169 | int tmpNumsSlices = (tmpWordLen + aveTokens - 1)/aveTokens;
170 | int tmpSliceSize = (tmpWordLen + tmpNumsSlices - 1)/tmpNumsSlices;
171 |
172 | for(int i = 0;i < tmpNumsSlices; i ++){
173 |
174 | //printf("i:%d\n", i);
175 | long long tmpStartIdx = wordIndices[tmpWordId] + tmpSliceSize*i;
176 | long long tmpEndIdx = tmpStartIdx + tmpSliceSize;
177 | if(tmpEndIdx >= wordIndices[tmpWordId + 1])
178 | tmpEndIdx = wordIndices[tmpWordId + 1];
179 |
180 | slotToWordVec.push_back(tmpWordId);
181 | slotIndicesVec.push_back(tmpStartIdx);
182 | slotIndicesVec.push_back(tmpEndIdx);
183 | numSlots ++;
184 | }
185 |
186 | }
187 | else{
188 | slotToWordVec.push_back(tmpWordId);
189 | slotIndicesVec.push_back(wordIndices[tmpWordId]);
190 | slotIndicesVec.push_back(wordIndices[tmpWordId + 1]);
191 | numSlots ++;
192 | }
193 |
194 | }
195 |
196 | for(int i = 0;i < 11;i ++)printf("%d\n",wordIndicesVec[i]);
197 | for(int i = 0;i < 10;i++)
198 | printf("i:%d, wordId:%d, len:%d, start:%lld, end:%lld\n",
199 | i, sortVec[i].wordId, sortVec[i].wordLen, wordIndices[sortVec[i].wordId] , wordIndices[sortVec[i].wordId + 1]);
200 |
201 | for(int i = 0;i < 10; i++)
202 | printf("slot:%d, wordId:%d, len:%d, start:%lld, end:%lld\n", i, slotToWordVec[i], slotIndicesVec[i*2 + 1] - slotIndicesVec[i*2], slotIndicesVec[i*2], slotIndicesVec[i*2+1]);
203 |
204 | printf("numSlots:%d\n", numSlots);
205 |
206 | slotIdToWordId = new int[numSlots];
207 | slotIndices = new long long[numSlots*2];
208 |
209 | for(int i = 0;i < numSlots; i++){
210 | slotIdToWordId[i] = slotToWordVec[i];
211 | slotIndices[i*2] = slotIndicesVec[i*2];
212 | slotIndices[i*2 + 1] = slotIndicesVec[i*2 + 1];
213 | }
214 |
215 | /* load tokens */
216 | if(wordTokens != NULL)delete []wordTokens;
217 | if(wordTopics != NULL)delete []wordTopics;
218 | wordTokens = new int[chunkNumTokens];
219 | wordTopics = new short[chunkNumTokens];
220 |
221 | wordDataStream.open(wordDataFileName.c_str(), ios::in);
222 | for(long long i = 0;i < chunkNumTokens;i++)
223 | wordDataStream.read((char*)(&(wordTokens[i])), sizeof(int));
224 |
225 | //generate doc reverse info
226 | if(docRevIndices != NULL)delete []docRevIndices;
227 | if(docRevIdx != NULL)delete []docRevIdx;
228 | docRevIndices = new long long[numDocs + 1]();
229 | docRevIdx = new TokenIdxType[chunkNumTokens]();
230 |
231 | for(int docId = 0, offset = 0; docId < numDocs; docId ++){
232 | if(docId >= docIdStart && docId < docIdEnd){
233 | docRevIndices[docId] = offset;
234 | docRevIndices[docId + 1] = offset + docLength[docId];
235 | offset += docLength[docId];
236 | }
237 | else{
238 | docRevIndices[docId] = offset;
239 | docRevIndices[docId + 1] = offset;
240 | }
241 | }
242 | vector tmpDocPtr;
243 | for(int docId = 0; docId < numDocs;docId ++)
244 | tmpDocPtr.push_back(docRevIndices[docId]);
245 |
246 | for(int wordId = 0;wordId < numWords; wordId ++){
247 | for(long long tokenId = wordIndices[wordId];
248 | tokenId < wordIndices[wordId + 1];
249 | tokenId ++){
250 |
251 | int tmpDocId = wordTokens[tokenId];
252 | docRevIdx[tmpDocPtr[tmpDocId]] = int(tokenId);
253 | tmpDocPtr[tmpDocId] ++;
254 | }
255 | }
256 | }
257 |
258 |
259 | void DocChunk::generateTopics(int k){
260 |
261 | srand (time(NULL));
262 | for(long long i = 0; i < chunkNumTokens;i++)
263 | wordTopics[i] = short(rand()%k);
264 | }
265 |
/* Lightweight (word id, occurrence count) pair used while bucketing words. */
struct wordStruct{
    int wordId;    // vocabulary id of the word
    int numTokens; // number of token occurrences of that word

    wordStruct(int arg1, int arg2): wordId(arg1), numTokens(arg2) {}
};
275 |
276 |
277 | void DocChunk::allocGPU(int GPUid)
278 | {
279 | cudaSetDevice(GPUid);
280 |
281 | if(deviceWordIndices != NULL)cudaFree(deviceWordIndices);
282 | if(deviceSlotIdToWordId != NULL)cudaFree(deviceSlotIdToWordId);
283 | if(deviceSlotIndices != NULL)cudaFree(deviceSlotIndices);
284 | if(deviceWordTokens != NULL)cudaFree(deviceWordTokens);
285 | if(deviceWordTopics != NULL)cudaFree(deviceWordTopics);
286 | if(deviceWordPerplexity != NULL)cudaFree(deviceWordPerplexity);
287 | if(deviceWordPerplexityMid != NULL)cudaFree(deviceWordPerplexityMid);
288 |
289 | if(deviceDocRevIndices != NULL)cudaFree(deviceDocRevIndices);
290 | if(deviceDocRevIdx != NULL)cudaFree(deviceDocRevIdx);
291 |
292 | cudaMalloc((void**)&deviceWordIndices, (numWords + 1)*sizeof(long long));
293 | cudaMalloc((void**)&deviceSlotIdToWordId, numSlots*sizeof(int));
294 | cudaMalloc((void**)&deviceSlotIndices, numSlots*2*sizeof(long long));
295 |
296 | cudaMalloc((void**)&deviceWordTokens, chunkNumTokens*sizeof(int));
297 | cudaMalloc((void**)&deviceWordTopics, chunkNumTokens*sizeof(short));
298 |
299 | cudaMalloc((void**)&deviceWordPerplexity, numWords*(TrainBlockSize/32)*sizeof(double));
300 | cudaMalloc((void**)&deviceWordPerplexityMid, ReduceParameter*sizeof(double));
301 |
302 | gpuErr(cudaPeekAtLastError());
303 |
304 | cudaMalloc((void**)&deviceDocRevIndices, (numDocs+1)*sizeof(long long));
305 | cudaMalloc((void**)&deviceDocRevIdx, chunkNumTokens*sizeof(TokenIdxType));
306 |
307 | long long totalByte = (numWords + 1)*sizeof(long long) +
308 | chunkNumTokens*sizeof(int) +
309 | chunkNumTokens*sizeof(short) +
310 | numWords*sizeof(double) +
311 | ReduceParameter*sizeof(double) +
312 | (numDocs + 1)*sizeof(long long) +
313 | chunkNumTokens*sizeof(TokenIdxType);
314 | printf("docChunk size:%.3f GB\n", totalByte/(1024.0*1024.0*1024.0));
315 |
316 | cudaDeviceSynchronize();
317 | gpuErr(cudaPeekAtLastError());
318 | }
319 |
320 | void DocChunk::toGPU()
321 | {
322 |
323 | //tokens
324 | cudaMemcpy(deviceWordIndices,
325 | wordIndices,
326 | sizeof(long long)*(numWords + 1),
327 | cudaMemcpyHostToDevice);
328 | cudaMemcpy(deviceSlotIdToWordId,
329 | slotIdToWordId,
330 | sizeof(int)*numSlots,
331 | cudaMemcpyHostToDevice);
332 | cudaMemcpy(deviceSlotIndices,
333 | slotIndices,
334 | sizeof(long long)*numSlots*2,
335 | cudaMemcpyHostToDevice);
336 | cudaMemcpy(deviceWordTokens,
337 | wordTokens,
338 | sizeof(int)*chunkNumTokens,
339 | cudaMemcpyHostToDevice);
340 | cudaMemcpy(deviceWordTopics,
341 | wordTopics,
342 | sizeof(short)*chunkNumTokens,
343 | cudaMemcpyHostToDevice);
344 |
345 |
346 | //doc rev data
347 | cudaMemcpy(deviceDocRevIndices,
348 | docRevIndices,
349 | sizeof(long long)*(numDocs + 1),
350 | cudaMemcpyHostToDevice);
351 | cudaMemcpy(deviceDocRevIdx,
352 | docRevIdx,
353 | sizeof(TokenIdxType)*chunkNumTokens,
354 | cudaMemcpyHostToDevice);
355 |
356 | }
357 |
358 | void DocChunk::toCPU()
359 | {
360 | printf("DocChunk::toCPU() ChunkId:%d...\n", chunkId);
361 | cudaMemcpy(wordTopics,
362 | deviceWordTopics,
363 | sizeof(short)*chunkNumTokens,
364 | cudaMemcpyDeviceToHost);
365 |
366 | printf("finished DocChunk::toCPU() ...\n");
367 | }
368 |
--------------------------------------------------------------------------------
/src_culda/model/doc_chunk.h:
--------------------------------------------------------------------------------
1 | #ifndef _DOC_CHUNK_
2 | #define _DOC_CHUNK_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 |
13 | #include
14 |
15 | #include "culda_argument.h"
16 |
17 | using namespace std;
18 |
19 | class DocChunk
20 | {
21 | public:
22 | int chunkId;
23 | int docIdStart;
24 | int docIdEnd;
25 |
26 | int numWorkers;
27 | int numDocs;
28 | int numWords;
29 | int numSlots;
30 | int numChunks;
31 |
32 | long long chunkNumTokens;
33 | int chunkNumDocs;
34 |
35 | /* original input data */
36 | long long *wordIndices; // numberWords + 1
37 | int *slotIdToWordId; // numSlots
38 | long long *slotIndices; // numSlots*2
39 | int *wordTokens; // chunkNumTokens
40 | short *wordTopics; // chunkNumTokens
41 | double *wordPerplexity; // chunkNumTokens
42 |
43 | long long *deviceWordIndices; // numWords + 1
44 | int *deviceSlotIdToWordId; // numSlots
45 | long long *deviceSlotIndices; // numSlots*2
46 | int *deviceWordTokens; // chunkNumTokens
47 | short *deviceWordTopics; // chunkNumTokens
48 | double *deviceWordPerplexity; // chunkNumTokens
49 |
50 | double *deviceWordPerplexityMid;
51 |
52 | /* reverse doc data */
53 | long long *docRevIndices; // numDocs + 1
54 | TokenIdxType *docRevIdx; // chunkTokenSize
55 |
56 | long long *deviceDocRevIndices; // numDocs + 1
57 | TokenIdxType *deviceDocRevIdx; // chunkTokenSize
58 |
59 | DocChunk();
60 | DocChunk(int argChunkId,
61 | int argDocIdStart,
62 | int argDocIdEnd,
63 | int argNumDocs,
64 | int argNumChunks);
65 | ~DocChunk()
66 | {
67 |
68 | if(wordIndices != NULL)delete []wordIndices;
69 | if(slotIdToWordId != NULL)delete []slotIdToWordId;
70 | if(slotIndices != NULL)delete []slotIndices;
71 | if(wordTokens != NULL)delete []wordTokens;
72 | if(wordTopics != NULL)delete []wordTopics;
73 | if(wordPerplexity != NULL)delete []wordPerplexity;
74 |
75 | if(deviceWordIndices != NULL)cudaFree(deviceWordIndices);
76 | if(deviceSlotIdToWordId != NULL)cudaFree(deviceSlotIdToWordId);
77 | if(deviceSlotIndices != NULL)cudaFree(deviceSlotIndices);
78 | if(deviceWordTokens != NULL)cudaFree(deviceWordTokens);
79 | if(deviceWordTopics != NULL)cudaFree(deviceWordTokens);
80 | if(deviceWordPerplexity != NULL)cudaFree(deviceWordPerplexity);
81 | if(deviceWordPerplexityMid != NULL)cudaFree(deviceWordPerplexityMid);
82 |
83 | if(deviceDocRevIndices != NULL)cudaFree(deviceDocRevIndices);
84 | if(deviceDocRevIdx != NULL)cudaFree(deviceDocRevIdx);
85 |
86 | }
87 |
88 | void loadChunk(string, string, int*);
89 | void generateTopics(int k);
90 |
91 | void allocGPU(int);
92 | void toGPU();
93 | void toCPU();
94 |
95 |
96 | };
97 |
98 | #endif
--------------------------------------------------------------------------------
/src_culda/model/model_phi.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include // std::time
10 | #include // std::rand, std::srand
11 | #include // std::memset
12 | #include
13 |
14 | #include
15 | #include "model_phi.h"
16 | #include "vocab.h"
17 |
18 |
19 |
20 | /* Implementations of class ModelPhi */
21 |
22 | ModelPhi::ModelPhi():
23 | k(0),
24 | numGPUs(1),
25 | numDocs(0),
26 | numWords(0),
27 | numChunks(1)
28 | {
29 | for(int i = 0;i < MaxNumGPU;i++){
30 | hostPhiTopicWordShort[i] = NULL;
31 | hostPhiTopic[i] = NULL;
32 | }
33 | clearPtr();
34 | }
35 |
36 | ModelPhi::ModelPhi(
37 | int argK, int argGPUs, int argDocs, int argWords, int argChunks):
38 | k(argK),
39 | numGPUs(argGPUs),
40 | numDocs(argDocs),
41 | numWords(argWords),
42 | numChunks(argChunks)
43 | {
44 | for(int i = 0;i < MaxNumGPU;i++){
45 | hostPhiTopicWordShort[i] = NULL;
46 | hostPhiTopic[i] = NULL;
47 | }
48 | clearPtr();
49 | }
50 |
51 | void ModelPhi::InitData(Document &doc)
52 | {
53 |
54 | clearPtr();
55 | if(k <= 0 || numDocs <= 0 || numWords <= 0)return;
56 |
57 | //prepare data space for phi.
58 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){
59 | hostPhiTopicWordShort[chunkId] = new PHITYPE[k*numWords]();
60 | hostPhiTopic[chunkId] = new int[k]();
61 | }
62 |
63 | for(int GPUid = 0; GPUid < numGPUs; GPUid ++){
64 | ModelPhiGPU *tmpPtr = new ModelPhiGPU(k, numGPUs, GPUid, numDocs, numWords);
65 | tmpPtr->allocGPU();
66 | phiChunkVec.push_back(tmpPtr);
67 | }
68 | }
69 |
70 | void ModelPhi::UpdatePhiGPU(Document &doc, int chunkId, cudaStream_t stream)
71 | {
72 | phiChunkVec[chunkId]->UpdatePhiGPU(doc, chunkId, stream);
73 | }
74 |
75 | void ModelPhi::UpdatePhiHead(float beta, cudaStream_t *stream)
76 | {
77 |
78 | if(stream != NULL){
79 | for(int i = 0;i < numGPUs; i++)
80 | phiChunkVec[i]->UpdatePhiHead(beta, stream[i]);
81 | }
82 | else{
83 | for(int i = 0;i < numGPUs; i++)
84 | phiChunkVec[i]->UpdatePhiHead(beta);
85 | }
86 | }
87 |
88 | void ModelPhi::MasterGPUToCPU(cudaStream_t stream)
89 | {
90 |
91 | //phi
92 | cudaMemcpyAsync(hostPhiTopicWordShort[0],
93 | phiChunkVec[0]->devicePhiTopicWordShort,
94 | sizeof(PHITYPE)*k*numWords,
95 | cudaMemcpyDeviceToHost,
96 | stream);
97 |
98 | cudaMemcpyAsync(hostPhiTopic[0],
99 | phiChunkVec[0]->devicePhiTopic,
100 | sizeof(int)*k,
101 | cudaMemcpyDeviceToHost,
102 | stream);
103 | }
104 |
105 | void ModelPhi::MasterGPUCollect(int GPUid, cudaStream_t stream)
106 | {
107 | cudaMemcpyAsync(phiChunkVec[0]->devicePhiTopicWordShortCopy,
108 | phiChunkVec[GPUid]->devicePhiTopicWordShort,
109 | sizeof(PHITYPE)*k*numWords,
110 | cudaMemcpyDeviceToDevice,
111 | stream);
112 | cudaMemcpyAsync(phiChunkVec[0]->devicePhiTopicCopy,
113 | phiChunkVec[GPUid]->devicePhiTopic,
114 | sizeof(int)*k,
115 | cudaMemcpyDeviceToDevice,
116 | stream);
117 | }
118 |
119 | void ModelPhi::MasterGPUDistribute(int GPUid, cudaStream_t stream)
120 | {
121 | cudaMemcpyAsync(phiChunkVec[GPUid]->devicePhiTopicWordShort,
122 | phiChunkVec[0]->devicePhiTopicWordShort,
123 | sizeof(PHITYPE)*k*numWords,
124 | cudaMemcpyDeviceToDevice,
125 | stream);
126 | cudaMemcpyAsync(phiChunkVec[GPUid]->devicePhiTopic,
127 | phiChunkVec[0]->devicePhiTopic,
128 | sizeof(int)*k,
129 | cudaMemcpyDeviceToDevice,
130 | stream);
131 | }
132 |
133 | void ModelPhi::MasterGPUReduce(cudaStream_t stream)
134 | {
135 | cudaSetDevice(0);
136 | LDAUpdatePhiReduceAPI(
137 | k,
138 | numWords,
139 | phiChunkVec[0]->devicePhiTopicWordShort,
140 | phiChunkVec[0]->devicePhiTopicWordShortCopy,
141 | phiChunkVec[0]->devicePhiTopic,
142 | phiChunkVec[0]->devicePhiTopicCopy,
143 | stream);
144 | }
145 |
146 | void ModelPhi::clearPtr()
147 | {
148 | for(int i = 0;i < phiChunkVec.size(); i++)
149 | if(phiChunkVec[i] != NULL)delete phiChunkVec[i];
150 |
151 | //CPU data release
152 | for(int i = 0;i < numChunks;i++){
153 | if(hostPhiTopicWordShort[i] != NULL) delete hostPhiTopicWordShort[i];
154 | if(hostPhiTopic[i] != NULL) delete hostPhiTopic[i];
155 | }
156 | //printf("ModelPhi::clearPtr() finished\n");
157 | }
158 |
159 |
160 |
161 | void ModelPhi::savePhi(string fileName)
162 | {
163 | printf("Saving phi ...\n");
164 |
165 | ofstream phiStream(fileName.c_str(), ios::out);
166 |
167 | int tmpNumWords = numWords;
168 | tmpNumWords = 1000;
169 | for(int wordId = 0;wordId < tmpNumWords; wordId++){
170 | phiStream << "Word id:" << wordId << ", ";
171 |
172 | for(int kite = 0; kite < k;kite++ ){
173 | if(kite%32 == 0){
174 | phiStream << endl;
175 | phiStream.width(2);
176 | phiStream << kite/32;
177 | phiStream.width(0);
178 | phiStream << ":";
179 | }
180 |
181 | int tmpVal = hostPhiTopicWordShort[0][wordId*k + kite];
182 | if(tmpVal == 0)
183 | phiStream << "_,";
184 | else
185 | phiStream << hostPhiTopicWordShort[0][wordId*k + kite] << ",";
186 | }
187 | phiStream << endl;
188 | }
189 | }
190 |
191 |
192 |
193 |
194 | void ModelPhi::validPhi(Document&doc)
195 | {
196 |
197 | printf("Calling validPhi() ...");
198 | int tmpPhi[1024];
199 | int tmpPhiTopic[1024];
200 |
201 | clock_t clockStart = clock();
202 | int maxPhi = 0;
203 | // validate hostPhiTopicWord
204 | for(int wordId = 0; wordId < numWords; wordId ++){
205 |
206 | for(int i = 0;i < 1024;i++)tmpPhi[i] = 0;
207 |
208 | //add
209 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){
210 | for(long long tokenId = doc.docChunkVec[chunkId]->wordIndices[wordId];
211 | tokenId < doc.docChunkVec[chunkId]->wordIndices[wordId + 1];
212 | tokenId ++){
213 |
214 | tmpPhi[doc.docChunkVec[chunkId]->wordTopics[tokenId]] ++;
215 | }
216 | }
217 |
218 | //validate hostPhiTopicWord
219 |
220 | for(int i = 0;i < 1024; i++){
221 | if(hostPhiTopicWordShort[0][wordId*k + i] > maxPhi) maxPhi = hostPhiTopicWordShort[0][wordId*k + i];
222 |
223 | if(tmpPhi[i] != hostPhiTopicWordShort[0][wordId*k + i]){
224 |
225 | printf("ValidPhi Error: wordId(%d), topic(%d), tmpphi[i](%d), phi(%d)\n",
226 | wordId, i, tmpPhi[i], hostPhiTopicWordShort[0][wordId*k + i]);
227 |
228 | printf("topic:%d, level1:%d, level2:%d\n", i, i/32, i%32);
229 |
230 | for(int j = 0;j < 32;j ++){
231 |
232 | printf("tmpphi:\n");
233 | printf("%2d:",j);
234 | for(int m = 0;m < 32;m ++){
235 | printf("%d,",tmpPhi[j*32 + m]);
236 | }
237 | printf("\n");
238 |
239 | printf("phi :\n");
240 | printf("%2d:",j);
241 | for(int m = 0;m < 32;m ++){
242 | printf("%d,",hostPhiTopicWordShort[0][wordId*k + j*32 + m]);
243 | }
244 | printf("\n");
245 |
246 | }
247 | exit(0);
248 | }
249 | }
250 | }
251 |
252 | //validate hostPhiTopic
253 | for(int i = 0;i < 1024; i++)tmpPhiTopic[i] = 0;
254 | //Step 1: add
255 |
256 | for(int wordId = 0; wordId < numWords; wordId ++){
257 |
258 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){
259 | for(long long tokenId = doc.docChunkVec[chunkId]->wordIndices[wordId];
260 | tokenId < doc.docChunkVec[chunkId]->wordIndices[wordId + 1];
261 | tokenId ++){
262 |
263 | tmpPhiTopic[doc.docChunkVec[chunkId]->wordTopics[tokenId]] ++;
264 | }
265 | }
266 | }
267 |
268 | //Step 2: validate
269 | for(int i = 0;i < 1024;i++){
270 | if(tmpPhiTopic[i] != hostPhiTopic[0][i]){
271 | printf("ValidPhi Error 2: topic(%d), tmpPhiTopic(%d), hostPhiTopic[i](%d)\n",
272 | i, tmpPhiTopic[i], hostPhiTopic[i]);
273 | exit(0);
274 | }
275 | }
276 |
277 | printf("Validate Phi passed ...\n");
278 | printf("ValidatePhi time:%.2fs\n", (clock() - clockStart)/(double)CLOCKS_PER_SEC);
279 | printf("max phi:%d\n", maxPhi);
280 | }
281 |
282 |
283 | /*
284 | class FreqTuple{
285 |
286 | public:
287 | int id;
288 | int count;
289 |
290 | FreqTuple(int arg1, int arg2):id(arg1),count(arg2){}
291 | friend bool operator<(const FreqTuple &left ,const FreqTuple &right){return left.count > right.count;}
292 | };
293 |
294 | void ModelPhi::saveWord(std::string wordFileName, Vocabulary &vocab)
295 | {
296 |
297 | printf("Saving word model ...\n");
298 | ofstream wordFileStream(wordFileName.c_str(), ios::out);
299 |
300 | wordFileStream << "topic, total, top words" << endl;
301 | for(int kite = 0; kite < k; kite ++){
302 |
303 | int totalCount = 0;
304 | vector wordVec;
305 |
306 | for(int wordId = 0; wordId < numWords; wordId ++){
307 |
308 | int tmpCount = hostPhiTopicWord[wordId*k + kite];
309 | if(tmpCount <= 0)continue;
310 |
311 | totalCount += tmpCount;
312 | wordVec.push_back(FreqTuple(wordId, tmpCount));
313 | }
314 |
315 | sort(wordVec.begin(), wordVec.end());
316 |
317 | //output
318 | if(totalCount <= 0)continue;
319 |
320 | wordFileStream.width(4);
321 | wordFileStream << kite << " " << totalCount << " ";
322 | for(int i = 0;i < wordVec.size(); i ++){
323 | wordFileStream << "(" << vocab.getWordById(wordVec[i].id) << ",";
324 | wordFileStream << wordVec[i].count << ") ";
325 | }
326 | wordFileStream << endl;
327 | }
328 | }
329 | */
330 |
331 |
/* Argument bundle for PhiReduceThreadShort: accumulate a slice of matrixB into matrixA. */
struct pthreadArgShort
{
    PHITYPE *matrixA;  // destination, updated in place
    PHITYPE *matrixB;  // source
    int idxStart;      // first element index of this thread's slice
    int idxEnd;        // one past the last index; may exceed matrixSize (worker clamps)
    int matrixSize;    // total element count, used for clamping
};
340 |
/* Argument bundle for PhiReduceThreadInt: accumulate a slice of matrixB into matrixA. */
struct pthreadArgInt
{
    int *matrixA;      // destination, updated in place
    int *matrixB;      // source
    int idxStart;      // first element index of this thread's slice
    int idxEnd;        // one past the last index; may exceed matrixSize (worker clamps)
    int matrixSize;    // total element count, used for clamping
};
349 |
350 |
351 |
352 | static void *PhiReduceThreadShort(void *arg)
353 | {
354 | pthreadArgShort *localArg = (pthreadArgShort*)arg;
355 |
356 | int startIdx = localArg->idxStart;
357 | int endIdx = localArg->idxEnd;
358 | if(endIdx >= localArg->matrixSize)
359 | endIdx = localArg->matrixSize;
360 |
361 | PHITYPE *matrixA = localArg->matrixA;
362 | PHITYPE *matrixB = localArg->matrixB;
363 | for(int i = startIdx; i < endIdx; i ++)
364 | matrixA[i] += matrixB[i];
365 | }
366 |
367 | static void *PhiReduceThreadInt(void *arg)
368 | {
369 | pthreadArgInt *localArg = (pthreadArgInt*)arg;
370 |
371 | int startIdx = localArg->idxStart;
372 | int endIdx = localArg->idxEnd;
373 | if(endIdx >= localArg->matrixSize)
374 | endIdx = localArg->matrixSize;
375 |
376 | int *matrixA = localArg->matrixA;
377 | int *matrixB = localArg->matrixB;
378 | for(int i = startIdx; i < endIdx; i ++)
379 | matrixA[i] += matrixB[i];
380 | }
381 |
382 | static void PhiReduceShort(PHITYPE *matrixA, PHITYPE *matrixB, int matrixSize)
383 | {
384 |
385 |
386 | struct timespec begin, end;
387 | double elapsed;
388 | clock_gettime(CLOCK_MONOTONIC, &begin);
389 |
390 | const int numThreads = 48;
391 | pthread_t threads[numThreads];
392 | pthreadArgShort threadArgs[numThreads];
393 | int perThreadSize = (matrixSize + numThreads - 1)/numThreads;
394 |
395 | //launch
396 | for(int threadId = 0; threadId < numThreads; threadId ++){
397 | threadArgs[threadId].matrixA = matrixA;
398 | threadArgs[threadId].matrixB = matrixB;
399 | threadArgs[threadId].idxStart = perThreadSize*threadId;
400 | threadArgs[threadId].idxEnd = perThreadSize*threadId + perThreadSize;
401 | threadArgs[threadId].matrixSize = matrixSize;
402 |
403 | pthread_create(&(threads[threadId]),
404 | NULL,
405 | PhiReduceThreadShort,
406 | (void*)(&(threadArgs[threadId])));
407 | }
408 |
409 | //join
410 | for(int threadId = 0; threadId < numThreads; threadId ++)
411 | pthread_join(threads[threadId], NULL);
412 |
413 | clock_gettime(CLOCK_MONOTONIC, &end);
414 |
415 | elapsed = end.tv_sec - begin.tv_sec;
416 | elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
417 | printf("PhiReduceShort: %.8lfs\n",elapsed);
418 |
419 | }
420 |
421 | static void PhiReduceInt(int *matrixA, int *matrixB, int matrixSize)
422 | {
423 |
424 |
425 | struct timespec begin, end;
426 | double elapsed;
427 | clock_gettime(CLOCK_MONOTONIC, &begin);
428 |
429 | const int numThreads = 16;
430 | pthread_t threads[numThreads];
431 | pthreadArgInt threadArgs[numThreads];
432 | int perThreadSize = (matrixSize + numThreads - 1)/numThreads;
433 |
434 | //launch
435 | for(int threadId = 0; threadId < numThreads; threadId ++){
436 | threadArgs[threadId].matrixA = matrixA;
437 | threadArgs[threadId].matrixB = matrixB;
438 | threadArgs[threadId].idxStart = perThreadSize*threadId;
439 | threadArgs[threadId].idxEnd = perThreadSize*threadId + perThreadSize;
440 | threadArgs[threadId].matrixSize = matrixSize;
441 |
442 | pthread_create(&(threads[threadId]),
443 | NULL,
444 | PhiReduceThreadInt,
445 | (void*)(&(threadArgs[threadId])));
446 | }
447 |
448 | //join
449 | for(int threadId = 0; threadId < numThreads; threadId ++)
450 | pthread_join(threads[threadId], NULL);
451 |
452 | clock_gettime(CLOCK_MONOTONIC, &end);
453 |
454 | elapsed = end.tv_sec - begin.tv_sec;
455 | elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
456 | printf("PhiReduceInt: %.8lfs\n",elapsed);
457 |
458 | }
459 |
460 | void ModelPhi::reduceCPU()
461 | {
462 | for(int i = 1;i < numChunks;i++){
463 | PhiReduceShort(hostPhiTopicWordShort[0], hostPhiTopicWordShort[i], numWords*k);
464 | PhiReduceInt(hostPhiTopic[0], hostPhiTopic[i], k);
465 | }
466 | }
467 |
468 |
469 |
470 |
471 |
--------------------------------------------------------------------------------
/src_culda/model/model_phi.h:
--------------------------------------------------------------------------------
1 | #ifndef _MODEL_PHI_H_
2 |
3 | #define _MODEL_PHI_H_
4 |
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 | #include "culda_argument.h"
13 | #include "vocab.h"
14 | #include "doc.h"
15 |
16 | #include "../kernel/lda_train_kernel.h"
17 | #include "../kernel/lda_phi_kernel.h"
18 | #include "model_phi_gpu.h"
19 |
20 | using namespace std;
21 |
22 | class ModelPhi
23 | {
24 | public:
25 | int k;
26 | int numGPUs;
27 | int numDocs;
28 | int numWords;
29 | int numChunks;
30 |
31 | //cpu data
32 | PHITYPE *hostPhiTopicWordShort[MaxNumGPU];
33 | int *hostPhiTopic[MaxNumGPU];
34 |
35 | vector phiChunkVec;
36 |
37 | ModelPhi();
38 | ModelPhi(int argK, int argGPU, int argDocs, int argWords, int numChunks);
39 |
40 | void InitData(Document&);
41 | void UpdatePhiGPU(Document&, int chunkId, cudaStream_t s=0);
42 | void UpdatePhiHead(float beta,cudaStream_t *stream=NULL);
43 |
44 | void clearPtr();
45 |
46 | ~ModelPhi(){ clearPtr();}
47 |
48 | void MasterGPUCollect(int GPUid, cudaStream_t stream=0);
49 | void MasterGPUDistribute(int GPUid, cudaStream_t stream=0);
50 | void MasterGPUToCPU(cudaStream_t stream=0);
51 | void MasterGPUReduce(cudaStream_t stream=0);
52 |
53 | void reduceCPU();
54 |
55 | void validPhi(Document&);
56 | void savePhi(string fileName);
57 | };
58 |
59 |
60 | #endif
--------------------------------------------------------------------------------
/src_culda/model/model_phi_gpu.cpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include // std::time
8 | #include // std::rand, std::srand
9 | #include // std::memset
10 | #include
11 |
12 | #include
13 | #include "model_phi_gpu.h"
14 | #include "vocab.h"
15 |
16 |
17 |
/* Default-construct an empty per-GPU phi replica; all device pointers start NULL. */
ModelPhiGPU::ModelPhiGPU():
    k(0),
    numGPUs(1),
    GPUid(0),
    numDocs(0),
    numWords(0),
    devicePhiTopicWordShort(NULL),
    devicePhiTopicWordSub(NULL),
    devicePhiTopic(NULL),
    devicePhiHead(NULL),
    devicePhiTopicWordShortCopy(NULL),
    devicePhiTopicCopy(NULL)
{
}
32 |
/*
 * Construct a per-GPU phi replica for GPU argid; no device memory is
 * allocated here — call allocGPU() afterwards.
 */
ModelPhiGPU::ModelPhiGPU(
    int argk,
    int argNumGPUs,
    int argid,
    int argdoc,
    int argword):
    k(argk),
    numGPUs(argNumGPUs),
    GPUid(argid),
    numDocs(argdoc),
    numWords(argword),
    devicePhiTopicWordShort(NULL),
    devicePhiTopicWordSub(NULL),
    devicePhiTopic(NULL),
    devicePhiHead(NULL),
    devicePhiTopicWordShortCopy(NULL),
    devicePhiTopicCopy(NULL)
{
}
52 |
53 | void ModelPhiGPU::allocGPU()
54 | {
55 | cudaSetDevice(GPUid);
56 | cudaMalloc((void**)&devicePhiTopicWordShort, sizeof(PHITYPE)*k*numWords);
57 | cudaMalloc((void**)&devicePhiTopicWordSub, sizeof(int)*k*UpdateNumWorkers);
58 | cudaMalloc((void**)&devicePhiTopic, sizeof(int)*k);
59 | cudaMalloc((void**)&devicePhiHead, sizeof(half)*k*numWords);
60 |
61 | if(GPUid == 0 && numGPUs > 1){
62 | cudaMalloc((void**)&devicePhiTopicWordShortCopy, sizeof(PHITYPE)*k*numWords);
63 | cudaMalloc((void**)&devicePhiTopicCopy, sizeof(int)*k);
64 | }
65 | long long totalByte = sizeof(PHITYPE)*k*numWords +
66 | sizeof(int)*k*UpdateNumWorkers +
67 | sizeof(int)*k +
68 | sizeof(half)*k*numWords;
69 |
70 | printf("phi sizeof:%.3f GB\n", totalByte/(1024.0*1024.0*1024.0));
71 |
72 | cudaDeviceSynchronize();
73 | gpuErr(cudaPeekAtLastError());
74 | }
75 |
/* Rebuild this GPU's phi counters from chunk chunkId's topic assignments. */
void ModelPhiGPU::UpdatePhiGPU(Document &doc, int chunkId, cudaStream_t stream)
{
    cudaSetDevice(GPUid);
    // topic totals are rebuilt from scratch each pass
    cudaMemsetAsync(devicePhiTopic, 0, k*sizeof(int), stream);

    LDAUpdatePhiAPI(
        k,
        numWords,
        doc.docChunkVec[chunkId]->deviceWordIndices,
        doc.docChunkVec[chunkId]->deviceWordTopics,
        devicePhiTopicWordShort,
        devicePhiTopicWordSub,
        devicePhiTopic,
        stream
        );
}
92 |
/* Recompute the half-precision phi "head" table from the current counters. */
void ModelPhiGPU::UpdatePhiHead(float beta, cudaStream_t stream)
{

    cudaSetDevice(GPUid);
    //printf("ModelPhiGPU::UpdatePhiHead() ... id:%d\n", GPUid);
    LDAComputePhiHeadAPI(
        k,
        beta,
        numWords,
        devicePhiTopicWordShort,
        devicePhiTopic,
        devicePhiHead,
        stream);
}
107 |
108 | void ModelPhiGPU::clearPtr()
109 | {
110 |
111 | if(devicePhiTopicWordShort != NULL) cudaFree(devicePhiTopicWordShort);
112 | if(devicePhiTopicWordSub != NULL) cudaFree(devicePhiTopicWordSub);
113 | if(devicePhiTopic != NULL) cudaFree(devicePhiTopic);
114 | if(devicePhiHead != NULL) cudaFree(devicePhiHead);
115 |
116 | devicePhiTopicWordSub = NULL;
117 | devicePhiTopicWordShort = NULL;
118 | devicePhiTopic = NULL;
119 | devicePhiHead = NULL;
120 | }
121 |
122 |
123 |
124 |
--------------------------------------------------------------------------------
/src_culda/model/model_phi_gpu.h:
--------------------------------------------------------------------------------
1 | #ifndef _MODEL_PHI_GPU_H_
2 | #define _MODEL_PHI_GPU_H_
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 | #include "culda_argument.h"
11 | #include "vocab.h"
12 | #include "doc.h"
13 |
14 | #include "../kernel/lda_train_kernel.h"
15 | #include "../kernel/lda_phi_kernel.h"
16 |
/*
 * One GPU's replica of the phi (word-topic) counters plus the derived
 * half-precision "head" table; GPU 0 may also own staging Copy buffers
 * for multi-GPU reduction (see allocGPU).
 */
class ModelPhiGPU
{
public:
    int k;        // number of topics
    int numGPUs;
    int GPUid;    // device this replica lives on
    int numDocs;
    int numWords;


    PHITYPE *devicePhiTopicWordShort; // k*numWords word-topic counters
    int *devicePhiTopicWordSub;       // k*UpdateNumWorkers partial sums
    int *devicePhiTopic;              // k per-topic totals
    half *devicePhiHead;              // k*numWords derived phi head values

    // staging buffers; only allocated on GPU 0 when numGPUs > 1
    PHITYPE *devicePhiTopicWordShortCopy;
    int *devicePhiTopicCopy;

    ModelPhiGPU();
    ModelPhiGPU(int, int, int, int, int);
    ~ModelPhiGPU(){clearPtr();}

    void allocGPU();
    void UpdatePhiGPU(Document &, int, cudaStream_t stream=0);
    void UpdatePhiHead(float, cudaStream_t stream=0);

    void clearPtr();
};
45 |
46 | #endif
--------------------------------------------------------------------------------
/src_culda/model/model_theta.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include // std::time
10 | #include // std::rand, std::srand
11 | #include // std::memset
12 | #include
13 |
14 | #include
15 | #include "model_theta.h"
16 | #include "vocab.h"
17 |
18 |
19 |
20 | /* Implementations of class ModelTheta */
21 |
/* Default-construct an empty theta model. */
ModelTheta::ModelTheta():
    k(0),
    numDocs(0),
    numWords(0),
    numChunks(0),
    thetaNNZ(0)
{
    clearPtr();
}
31 |
/* Construct a theta model sized for k topics; chunks are built in InitData(). */
ModelTheta::ModelTheta(int argK,int argDocs, int argWords, int argNumChunks):
    k(argK),
    numDocs(argDocs),
    numWords(argWords),
    numChunks(argNumChunks),
    thetaNNZ(0)
{
    clearPtr();
}
41 |
42 |
43 |
44 | void ModelTheta::InitData(Document &doc)
45 | {
46 |
47 | clearPtr();
48 |
49 | vector docLenVec;
50 | for(int docId = 0; docId < numDocs; docId ++){
51 | int tmpLen = doc.docLength[docId];
52 | if(tmpLen >= k)tmpLen = k;
53 |
54 | tmpLen = ((tmpLen + 31)/32)*32;
55 | docLenVec.push_back(tmpLen);
56 | }
57 |
58 | //chunk by chunk
59 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){
60 | int tmpChunkNNZ = 0;
61 |
62 | for(int docId = doc.docChunkVec[chunkId]->docIdStart;
63 | docId < doc.docChunkVec[chunkId]->docIdEnd;
64 | docId ++)
65 | tmpChunkNNZ += docLenVec[docId];
66 |
67 | ModelThetaChunk *tmpPtr = new ModelThetaChunk(
68 | k,
69 | doc.numDocs,
70 | doc.numWords,
71 | doc.numChunks,
72 | chunkId,
73 | doc.docChunkVec[chunkId]->docIdStart,
74 | doc.docChunkVec[chunkId]->docIdEnd,
75 | doc.docChunkVec[chunkId]->docIdEnd - doc.docChunkVec[chunkId]->docIdStart,
76 | tmpChunkNNZ);
77 |
78 | //printf("chunkId:%d, tmpChunkNNZ:%d\n", chunkId, tmpChunkNNZ);
79 |
80 | tmpPtr->InitData(docLenVec);
81 | thetaChunkVec.push_back(tmpPtr);
82 | }
83 |
84 | cudaDeviceSynchronize();
85 | gpuErr(cudaPeekAtLastError());
86 | }
87 |
88 | void ModelTheta::UpdateThetaGPU(Document &doc, cudaStream_t *stream)
89 | {
90 | cudaDeviceSynchronize();
91 | gpuErr(cudaPeekAtLastError());
92 |
93 | if(stream != NULL){
94 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
95 | thetaChunkVec[chunkId]->UpdateThetaGPU(doc, stream[chunkId]);
96 | }
97 | else{
98 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
99 | thetaChunkVec[chunkId]->UpdateThetaGPU(doc);
100 | }
101 | cudaDeviceSynchronize();
102 | gpuErr(cudaPeekAtLastError());
103 | }
104 |
105 | void ModelTheta::toGPU()
106 | {
107 | for(int chunkId = 0; chunkId < numChunks; chunkId ++)
108 | thetaChunkVec[chunkId]->toGPU();
109 | }
110 |
111 | void ModelTheta::toCPU()
112 | {
113 | for(int chunkId = 0; chunkId < numChunks; chunkId ++)
114 | thetaChunkVec[chunkId]->toCPU();
115 |
116 | }
117 |
118 | void ModelTheta::clearPtr()
119 | {
120 | for(int i = 0;i < thetaChunkVec.size(); i++)
121 | delete thetaChunkVec[i];
122 | }
123 |
124 |
125 | /*
126 | float ModelTheta::countZero()
127 | {
128 | toCPU();
129 | long long totalEntry = 0;
130 | long long zeroEntry = 0;
131 | for(int i = 0;i < numDocs;i++){
132 | for(int j = hostThetaMaxIA[i];j < hostThetaMaxIA[i+1];j++){
133 | totalEntry++;
134 | if(hostThetaA[j] == 0)zeroEntry ++;
135 | }
136 | }
137 | return 1.0*zeroEntry/totalEntry;
138 | }
139 | */
140 |
141 | /*
142 | float ModelTheta::countIA()
143 | {
144 | toCPU();
145 |
146 | long long total = 0;
147 | long long cur = 0;
148 | for(int i = 0;i < numDocs;i++){
149 | int startIdx = hostThetaMaxIA[i];
150 | int maxEndIdx = hostThetaMaxIA[i+1];
151 | int curEndIdx = hostThetaCurIA[i];
152 | total += maxEndIdx - startIdx;
153 | cur += curEndIdx - startIdx;
154 | }
155 |
156 | return 1.0*cur/total;
157 | }
158 | */
159 |
160 |
161 | /*
162 | void ModelTheta::saveTheta(string fileName)
163 | {
164 |
165 | printf("Saving theta ...\n");
166 |
167 | ofstream thetaStream(fileName.c_str(), ios::out);
168 | int tmpNumDocs = numDocs;
169 | tmpNumDocs = 1000;
170 | for(int i = 0;i < tmpNumDocs;i++){
171 | thetaStream << "Doc id:" << i << ", ";
172 | thetaStream << "len:" << hostThetaMaxIA[i+1] - hostThetaMaxIA[i] << ", ";
173 | thetaStream << "MaxIA:" << hostThetaMaxIA[i] << " - " << hostThetaMaxIA[i + 1] << ", ";
174 | thetaStream << "CurIA:" << hostThetaCurIA[i];
175 | thetaStream << endl;
176 |
177 | int totalA = 0;
178 | for(int j = hostThetaMaxIA[i]; j < hostThetaMaxIA[i+1] ; j++){
179 |
180 | thetaStream << "IA(" << j << "), "
181 | << "JA(" << hostThetaJA[j] << "), "
182 | << "A(" << hostThetaA[j] << ")\n";
183 | totalA += hostThetaA[j];
184 | }
185 | thetaStream << "total: " << totalA << "\n";
186 | }
187 | }
188 | */
189 |
190 |
191 | void ModelTheta::validTheta(Document &doc)
192 | {
193 | for(int chunkId = 0; chunkId < numChunks; chunkId ++)
194 | thetaChunkVec[chunkId]->validTheta(doc);
195 | }
196 |
/* (id, count) pair ordered by descending count, so sort() ranks the most
   frequent entries first. */
class FreqTuple{

public:
    int id;    // topic (or word) id
    int count; // occurrence count

    FreqTuple(int arg1, int arg2):id(arg1),count(arg2){}
    // inverted comparison: "less" means a strictly larger count
    friend bool operator<(const FreqTuple &left ,const FreqTuple &right){
        return left.count > right.count;
    }
};
206 |
207 | /*
208 | void ModelTheta::saveDoc(std::string docFileName)
209 | {
210 | ofstream docFileStream(docFileName.c_str(), ios::out);
211 |
212 | printf("Saving doc model ...\n");
213 | docFileStream << "Doc, total, top topics" << endl;
214 |
215 | for(int docId = 0; docId < numDocs; docId ++){
216 |
217 | int totalCount = 0;
218 | vector topicVec;
219 | for(int tmpIdx = hostThetaMaxIA[docId]; tmpIdx < hostThetaMaxIA[docId + 1]; tmpIdx ++){
220 |
221 | int tmpCount = hostThetaA[tmpIdx];
222 | int tmpK = hostThetaJA[tmpIdx];
223 | if(tmpCount == 0)continue;
224 |
225 | totalCount += tmpCount;
226 | topicVec.push_back(FreqTuple(tmpK, tmpCount));
227 | }
228 |
229 | sort(topicVec.begin(), topicVec.end());
230 |
231 | //output
232 | docFileStream.width(4);
233 | docFileStream << docId << "| " << totalCount << " ";
234 | for(int i = 0;i < topicVec.size();i++){
235 | docFileStream << "(" << topicVec[i].id << ",";
236 | docFileStream << topicVec[i].count << "), ";
237 | }
238 | docFileStream << endl;
239 | }
240 |
241 | }
242 | */
243 |
244 |
245 |
246 |
247 |
248 |
--------------------------------------------------------------------------------
/src_culda/model/model_theta.h:
--------------------------------------------------------------------------------
1 | #ifndef _GPU_THETA_H_
2 |
3 | #define _GPU_THETA_H_
4 |
5 |
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | #include
12 | #include "culda_argument.h"
13 | #include "vocab.h"
14 | #include "doc.h"
15 |
16 | #include "../kernel/lda_train_kernel.h"
17 | #include "../kernel/lda_theta_kernel.h"
18 | #include "model_theta_chunk.h"
19 |
20 | using namespace std;
21 |
// Host-side handle for the document-topic (theta) model.
// The actual CSR-like storage lives in per-chunk ModelThetaChunk objects
// (one chunk per GPU); this class mostly fans calls out to them.
class ModelTheta
{
public:
    int k;          // number of topics
    int numDocs;    // total number of documents
    int numWords;   // vocabulary size
    int numChunks;  // number of document chunks

    int thetaNNZ;   // NOTE(review): not visibly updated in this view - confirm

    // Per-chunk theta models; used through pointers (thetaChunkVec[i]->...).
    // NOTE(review): the element type was lost in extraction.
    vector thetaChunkVec;

    ModelTheta();
    ModelTheta(int argK, int argDocs, int argWords, int argNumChunks);

    // Allocate CPU+GPU storage for every chunk from the document layout.
    void InitData(Document&);
    // Debug: cross-check theta against raw topic assignments (per chunk).
    void validTheta(Document&);
    // Rebuild theta on the GPU(s) from the current topic assignments.
    void UpdateThetaGPU(Document &doc, cudaStream_t *stream=NULL);
    // Release all chunk resources.
    void clearPtr();

    ~ModelTheta(){ clearPtr();}

    void toGPU();  // host -> device transfer
    void toCPU();  // device -> host transfer

    //float countZero();
    //float countIA();

    //void saveTheta(string fileName);
    //void saveDoc(std::string docFileName);

};
54 |
55 |
56 | #endif
--------------------------------------------------------------------------------
/src_culda/model/model_theta_chunk.cpp:
--------------------------------------------------------------------------------
1 | /* Implementations of class ModelThetaChunk */
2 |
3 | #include "model_theta_chunk.h"
4 |
// Default constructor: zero every scalar field and NULL every buffer
// pointer.  The trailing clearPtr() call is a no-op at this point (all
// pointers are already NULL) and merely re-asserts that state.
ModelThetaChunk::ModelThetaChunk():
    k(0),
    numDocs(0),
    numWords(0),
    numChunks(0),
    chunkId(0),
    docIdStart(0),
    docIdEnd(0),
    chunkNumDocs(0),
    chunkNNZ(0),
    deviceThetaA(NULL),
    deviceThetaJA(NULL),
    deviceThetaCurIA(NULL),
    deviceThetaMaxIA(NULL),
    deviceDenseTheta(NULL),
    hostThetaA(NULL),
    hostThetaJA(NULL),
    hostThetaCurIA(NULL),
    hostThetaMaxIA(NULL)
{
    clearPtr();
}
27 |
// Construct a chunk that owns documents [argDocIdStart, argDocIdEnd) with
// argChunkNNZ theta slots.  Buffers stay NULL until InitData() allocates
// them; clearPtr() here is again a no-op on the freshly NULLed pointers.
ModelThetaChunk::ModelThetaChunk(
    int argK,
    int argNumDocs,
    int argNumWords,
    int argNumChunks,
    int argChunkId,
    int argDocIdStart,
    int argDocIdEnd,
    int argChunkNumDocs,
    int argChunkNNZ):
    k(argK),
    numDocs(argNumDocs),
    numWords(argNumWords),
    numChunks(argNumChunks),
    chunkId(argChunkId),
    docIdStart(argDocIdStart),
    docIdEnd(argDocIdEnd),
    chunkNumDocs(argChunkNumDocs),
    chunkNNZ(argChunkNNZ),
    deviceThetaA(NULL),
    deviceThetaJA(NULL),
    deviceThetaCurIA(NULL),
    deviceThetaMaxIA(NULL),
    deviceDenseTheta(NULL),
    hostThetaA(NULL),
    hostThetaJA(NULL),
    hostThetaCurIA(NULL),
    hostThetaMaxIA(NULL)
{
    clearPtr();
}
59 |
60 |
61 |
// Allocate this chunk's theta storage on host and device and build the
// per-document row boundaries (MaxIA) of the CSR-like layout.
//
// docLenVec[docId] gives the number of theta slots reserved for docId.
// MaxIA spans ALL documents: rows before docIdStart keep the memset 0,
// rows in [docIdStart, docIdEnd) get a running prefix-sum of docLenVec,
// and rows from docIdEnd through numDocs are clamped to the final offset,
// so out-of-chunk documents have empty ranges.
//
// NOTE(review): hostThetaCurIA is allocated but never initialized here and
// deviceThetaCurIA is not cleared - presumably the update kernel writes it
// before any read; confirm in lda_theta_kernel.
void ModelThetaChunk::InitData(const vector &docLenVec)
{
    //alloc space
    hostThetaA = new short[chunkNNZ];
    hostThetaJA = new short[chunkNNZ];
    hostThetaCurIA = new int[numDocs];
    hostThetaMaxIA = new int[numDocs + 1];

    //CPU side
    memset(hostThetaMaxIA, 0, sizeof(int)*(numDocs + 1));

    int offset = 0;
    for(int docId = docIdStart; docId < docIdEnd; docId ++){
        hostThetaMaxIA[docId] = offset;
        hostThetaMaxIA[docId + 1] = offset + docLenVec[docId];
        offset += docLenVec[docId];
    }
    for(int docId = docIdEnd; docId <= numDocs ;docId ++)
        hostThetaMaxIA[docId] = offset;

    // The chunk id doubles as the CUDA device id.
    cudaSetDevice(chunkId);
    //GPU side
    cudaMalloc((void**)&deviceThetaA, sizeof(short)*chunkNNZ);
    cudaMalloc((void**)&deviceThetaJA, sizeof(short)*chunkNNZ);
    cudaMalloc((void**)&deviceThetaMaxIA, sizeof(int)*(numDocs + 1));
    cudaMalloc((void**)&deviceThetaCurIA, sizeof(int)*numDocs);
    cudaMalloc((void**)&deviceDenseTheta, sizeof(int)*UpdateNumWorkers*k);

    // Report the total device footprint of this chunk.
    long long totalByte = sizeof(short)*chunkNNZ +
                          sizeof(short)*chunkNNZ +
                          sizeof(int)*(numDocs + 1) +
                          sizeof(int)*numDocs +
                          sizeof(int)*UpdateNumWorkers*k;
    printf("theta chunk size:%.3f GB\n",totalByte/(1024.0*1024.0*1024.0));

    //exit(0);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //transfer MaxIA
    toGPU();
}
105 |
// Rebuild this chunk's theta on the GPU from the per-token topic
// assignments held in the matching doc chunk.  `stream` is used for the
// async memsets and forwarded to the kernel wrapper, but the surrounding
// cudaDeviceSynchronize() calls make the whole call effectively
// synchronous with respect to this device.
void ModelThetaChunk::UpdateThetaGPU(Document &doc, cudaStream_t stream)
{

    cudaSetDevice(chunkId);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    // Zero the value (A) and topic-id (JA) arrays before recounting.
    cudaMemsetAsync(deviceThetaA, 0, sizeof(short)*chunkNNZ, stream);
    cudaMemsetAsync(deviceThetaJA, 0, sizeof(short)*chunkNNZ, stream);
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    // Kernel wrapper (see lda_theta_kernel.h) that repopulates the sparse
    // theta arrays for documents [docIdStart, docIdEnd).
    LDAUpdateThetaAPI(
        k,
        numDocs,
        chunkNumDocs,
        docIdStart,
        docIdEnd,
        doc.docChunkVec[chunkId]->deviceWordIndices,
        doc.docChunkVec[chunkId]->deviceWordTokens,
        doc.docChunkVec[chunkId]->deviceWordTopics,
        doc.docChunkVec[chunkId]->deviceDocRevIndices,
        doc.docChunkVec[chunkId]->deviceDocRevIdx,
        deviceThetaA,
        deviceThetaCurIA,
        deviceThetaMaxIA,
        deviceThetaJA,
        deviceDenseTheta,
        stream
    );

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());
}
141 |
// Debug/validation pass: cross-check this chunk's sparse theta against the
// raw per-token topic assignments in `doc`.  Any inconsistency prints a
// diagnostic dump and terminates the process.
void ModelThetaChunk::validTheta(Document &doc)
{

    printf("Calling ModelThetaChunk::validTheta() for chunk %d ...\n", chunkId);
    clock_t clockStart = clock();


    // Pass 1: structural checks.  Each document's row [MaxIA[d], MaxIA[d+1])
    // must be 32-aligned, and at most one slot per row may combine JA==0
    // with a non-zero count (JA==0 also marks empty slots).
    printf("theta zero check\n");
    for(int docId = docIdStart; docId < docIdEnd; docId ++)
    {

        int foundFlag = 0, errorFlag = 0;
        long long tmpStart = hostThetaMaxIA[docId];
        long long tmpEnd = hostThetaMaxIA[docId + 1];

        if(tmpStart%32 != 0){
            printf("tmpStart non-aligned error\n");
            exit(0);

        }
        if(tmpEnd%32 != 0){
            printf("tmpEnd non-aligned error\n");
            exit(0);
        }

        for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){

            if(hostThetaJA[tmpIdx] == 0 && hostThetaA[tmpIdx] != 0)
            {
                // A second such slot in the same row is an error.
                if(foundFlag == 1)errorFlag = 1;
                else foundFlag = 1;
            }
        }

        if(errorFlag == 1)
        {
            printf("error in validTheta\n");

            for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){
                printf("IA(%lld), JA(%d), A(%d)\n", tmpIdx, hostThetaJA[tmpIdx], hostThetaA[tmpIdx]);
            }
            exit(0);
        }
    }


    //doc by doc check
    // Pass 2: recompute each document's dense topic histogram from the raw
    // token->topic assignments and compare it entry-by-entry against the
    // sparse row.
    int *tmpThetaArray = new int[k]();
    int *tmpMask = new int[k]();
    for(int docId = docIdStart;docId < docIdEnd; docId ++){

        //generate the dense array
        for(int i = 0;i < k;i++){
            tmpThetaArray[i] = 0;
            tmpMask[i] = 0;
        }

        // Count topic occurrences over the doc's tokens via the reverse index.
        for(long long revIdx = doc.docChunkVec[chunkId]->docRevIndices[docId];
            revIdx < doc.docChunkVec[chunkId]->docRevIndices[docId + 1];
            revIdx ++){

            int tokenIdx = doc.docChunkVec[chunkId]->docRevIdx[revIdx];
            int tmpTopic = doc.docChunkVec[chunkId]->wordTopics[tokenIdx];
            tmpThetaArray[tmpTopic] ++;
        }

        long long tmpStart = hostThetaMaxIA[docId];
        long long tmpEnd = hostThetaMaxIA[docId + 1];

        //round 1, check non-zero element
        for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){
            int tmpK = hostThetaJA[tmpIdx];
            int tmpVal = hostThetaA[tmpIdx];

            if(tmpVal == 0)continue;

            tmpMask[tmpK] = 1;  // topic seen in the sparse row
            if(tmpThetaArray[tmpK] != tmpVal){
                printf("ValidTheta Error 1: docId(%d), topic(%d), tmpTheta(%d), theta(%d), IA(%lld)\n",
                       docId,
                       tmpK,
                       tmpThetaArray[tmpK],
                       tmpVal,
                       tmpIdx);

                printf("tmpTheta:\n");

                // Dump the dense histogram, 32 topics per line.
                for(int j = 0;j < k/32;j ++){
                    printf("%2d:",j);
                    for(int m = 0;m < 32;m ++){
                        printf("%d,",tmpThetaArray[j*32 + m]);
                    }
                    printf("\n");
                }
                exit(0);
            }
        }

        //round2, check zero element
        // Every topic absent from the sparse row must have count 0.
        for(int tmpK = 0;tmpK < k; tmpK ++){
            if(tmpMask[tmpK] == 1)continue;

            if(tmpThetaArray[tmpK] != 0){
                printf("ValidTheta Error 2: docId(%d), topic(%d), val(%d)\n",
                       docId, tmpK, tmpThetaArray[tmpK]);

                printf("docId:(%d)\n", docId);
                for(int idx = hostThetaMaxIA[docId]; idx < hostThetaMaxIA[docId +1]; idx ++){
                    printf("IA(%d), JA(%d), A(%d)\n", idx, hostThetaJA[idx], hostThetaA[idx]);

                    if(hostThetaJA[idx] == 0 && hostThetaA[idx] == 0)break;
                }

                printf("tmpTheta:\n");
                for(int j = 0;j < k/32;j ++){
                    printf("%2d:",j);
                    for(int m = 0;m < 32;m ++){
                        printf("%d,",tmpThetaArray[j*32 + m]);
                    }
                    printf("\n");
                }
                exit(0);
            }
        }
    }

    printf("Validate Theta passed ...\n");
    printf("ValidateTheta time:%.2fs\n", (clock() - clockStart)/(double)CLOCKS_PER_SEC);

    delete []tmpThetaArray;
    delete []tmpMask;
}
274 |
275 | void ModelThetaChunk::toGPU()
276 | {
277 | //printf("ModelThetaChunk(%d)::toGPU() ...\n", chunkId);
278 |
279 | cudaMemcpy(deviceThetaMaxIA,
280 | hostThetaMaxIA,
281 | sizeof(int)*(numDocs + 1),
282 | cudaMemcpyHostToDevice);
283 | //gpuErr(cudaPeekAtLastError());
284 |
285 | //printf("ModelThetaChunk::toGPU() finished ...\n\n");
286 | }
287 |
// Copy this chunk's theta arrays (A, JA, MaxIA, CurIA) from the device
// back to the host buffers.  The leading/trailing synchronizations bracket
// the transfers; the per-copy gpuErr checks are uneven (JA has none) -
// NOTE(review): presumably an oversight rather than intentional.
void ModelThetaChunk::toCPU()
{
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //theta
    cudaMemcpy(hostThetaA,
               deviceThetaA,
               sizeof(short)*chunkNNZ,
               cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    cudaMemcpy(hostThetaJA,
               deviceThetaJA,
               sizeof(short)*chunkNNZ,
               cudaMemcpyDeviceToHost);

    cudaMemcpy(hostThetaMaxIA,
               deviceThetaMaxIA,
               sizeof(int)*(numDocs + 1),
               cudaMemcpyDeviceToHost);
    gpuErr(cudaPeekAtLastError());

    cudaMemcpy(hostThetaCurIA,
               deviceThetaCurIA,
               sizeof(int)*numDocs,
               cudaMemcpyDeviceToHost);
    gpuErr(cudaPeekAtLastError());

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

}
324 |
325 | void ModelThetaChunk::clearPtr()
326 | {
327 | if(deviceThetaA != NULL) cudaFree(deviceThetaA);
328 | if(deviceThetaJA != NULL) cudaFree(deviceThetaJA);
329 | if(deviceThetaMaxIA != NULL) cudaFree(deviceThetaMaxIA);
330 | if(deviceThetaCurIA != NULL) cudaFree(deviceThetaCurIA);
331 | if(deviceDenseTheta != NULL) cudaFree(deviceDenseTheta);
332 |
333 | //CPU data release
334 | if(hostThetaA != NULL) delete []hostThetaA;
335 | if(hostThetaJA != NULL) delete []hostThetaJA;
336 | if(hostThetaMaxIA != NULL) delete []hostThetaMaxIA;
337 | if(hostThetaCurIA != NULL) delete []hostThetaCurIA;
338 |
339 | deviceThetaA = NULL;
340 | deviceThetaJA = NULL;
341 | deviceThetaMaxIA = NULL;
342 | deviceThetaCurIA = NULL;
343 | deviceDenseTheta = NULL;
344 |
345 | hostThetaA = NULL;
346 | hostThetaJA = NULL;
347 | hostThetaMaxIA = NULL;
348 | hostThetaCurIA = NULL;
349 | }
350 |
351 |
--------------------------------------------------------------------------------
/src_culda/model/model_theta_chunk.h:
--------------------------------------------------------------------------------
1 | #ifndef _GPU_THETA_CHUNK_H_
2 |
3 | #define _GPU_THETA_CHUNK_H_
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include
11 | #include "culda_argument.h"
12 | #include "vocab.h"
13 | #include "doc.h"
14 |
15 | #include "../kernel/lda_train_kernel.h"
16 | #include "../kernel/lda_theta_kernel.h"
17 |
18 | using namespace std;
19 |
// Per-GPU slice of the document-topic (theta) model, stored CSR-like:
// A (counts), JA (topic ids), MaxIA (per-document row boundaries),
// CurIA (per-document cursor written on the GPU).
class ModelThetaChunk
{
public:
    int k;          // number of topics
    int numDocs;    // total docs in the corpus (IA arrays are globally indexed)
    int numWords;   // vocabulary size
    int numChunks;  // total number of chunks

    int chunkId;      // also used as the CUDA device id (cudaSetDevice(chunkId))
    int docIdStart;   // first doc owned by this chunk
    int docIdEnd;     // one past the last owned doc
    int chunkNumDocs; // passed in separately; presumably docIdEnd - docIdStart

    int chunkNNZ;   // theta slots allocated for this chunk

    //GPU data
    short *deviceThetaA; //chunkNNZ
    short *deviceThetaJA; //chunkNNZ
    int *deviceThetaCurIA; //numDocs
    int *deviceThetaMaxIA; //numDocs + 1
    int *deviceDenseTheta; //UpdateNumWorkers*k scratch (see InitData); the
                           //former "chunkNumDocs + 1" note did not match
                           //the actual allocation

    //CPU data
    short *hostThetaA; //chunkNNZ
    short *hostThetaJA; //chunkNNZ
    int *hostThetaCurIA; //numDocs
    int *hostThetaMaxIA; //numDocs + 1

    ModelThetaChunk();
    ModelThetaChunk(int, int, int, int, int, int, int, int, int);


    // Allocate host+device arrays and build MaxIA from per-doc lengths.
    void InitData(const vector &);
    void toGPU();  // upload MaxIA
    void toCPU();  // download A/JA/MaxIA/CurIA
    // Recount theta on the GPU from the current topic assignments.
    void UpdateThetaGPU(Document &, cudaStream_t stream=0);
    // Debug: cross-check theta against raw assignments (exits on mismatch).
    void validTheta(Document&);

    void clearPtr();  // free everything, NULL the pointers
};
60 |
61 | #endif
--------------------------------------------------------------------------------
/src_culda/model/vocab.cpp:
--------------------------------------------------------------------------------
1 |
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include
10 |
11 | #include "vocab.h"
12 |
13 |
14 | //Definitions of methods of class Voculary
15 | void Vocabulary::clear()
16 | {
17 | wordList.clear();
18 | wordSet.clear();
19 | }
20 |
21 | bool Vocabulary::hasWord(const string & token)
22 | {
23 | return wordSet.find(Word(token,0)) != wordSet.end();
24 | }
25 |
26 | void Vocabulary::insertWord(const string & token)
27 | {
28 | if(!hasWord(token)){
29 | Word tmpWord(token, wordList.size());
30 | wordList.push_back(tmpWord);
31 | wordSet.insert(tmpWord);
32 | }
33 | }
34 |
35 | string Vocabulary::getWordById(int argId)
36 | {
37 |
38 | if(argId >= wordList.size())cout << "overflow:" << argId << endl;
39 | return wordList[argId].token;
40 | }
41 |
42 | int Vocabulary::getIdByWord(string argToken)
43 | {
44 | set::iterator setIte = wordSet.find(argToken);
45 | if(setIte == wordSet.end())return -1;
46 | else return setIte->id;
47 | }
48 |
// Load a vocabulary from a file of "token id" pairs (the format written by
// writeVocab()).
//
// NOTE(review): the id column is parsed but ignored; insertWord() assigns
// consecutive ids in file order.  Round-trips correctly only when the file
// is ordered by id (writeVocab emits it that way) - confirm for external
// files.
void Vocabulary::loadVocab(string fileName)
{

    clear();
    ifstream inputFile(fileName.c_str(), ios::in);

    if(!inputFile.is_open()){
        cout << "Vocabulary file " << fileName << " open failed" << endl;
        exit(0);
    }

    string token;
    int id;
    while(inputFile >> token >> id){
        insertWord(token);
    }
}
66 |
67 | void Vocabulary::writeVocab(string fileName)
68 | {
69 |
70 | ofstream outputFile(fileName.c_str(), ios::out);
71 |
72 | if(!outputFile.is_open()){
73 | cout << "Vocabulary file " << fileName << " open failed" << endl;
74 | exit(0);
75 | }
76 |
77 | for(int i = 0;i < wordList.size(); i++)
78 | outputFile << wordList[i].token << " " << wordList[i].id << endl;
79 |
80 | }
81 |
82 | void Vocabulary::printVocabAbbr()
83 | {
84 | printf("----vocab info-----\n");
85 | printf("numWords:%d\n", wordList.size());
86 |
87 | }
88 |
89 | void Vocabulary::printVocabFull()
90 | {
91 |
92 | for(int i = 0;i < wordList.size();i++)
93 | cout << "(" << wordList[i].token << "," << wordList[i].id << ")" << endl;
94 |
95 | }
96 |
// Convenience constructor: build the vocabulary directly from a file
// (see loadVocab for the expected format and error handling).
Vocabulary::Vocabulary(const string &fname)
{
    loadVocab(fname);
}
101 |
102 |
103 |
--------------------------------------------------------------------------------
/src_culda/model/vocab.h:
--------------------------------------------------------------------------------
1 | #ifndef _VOCAB_H_
2 |
3 | #define _VOCAB_H_
4 |
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | using namespace std;
11 |
// A single vocabulary entry: the token string plus its integer id.
// Ordering and equality are defined on the token only, so Word can serve
// as the key of a std::set while carrying the id as payload.
class Word{
public:
    std::string token;  // the word itself
    int id;             // id assigned by the vocabulary; -1 when unassigned

    bool operator < (const Word & arg) const{
        return token.compare(arg.token) < 0;
    }
    bool operator > (const Word & arg) const{
        return token.compare(arg.token) > 0;
    }
    bool operator == (const Word & arg) const{
        return token.compare(arg.token) == 0;
    }

    // The default and single-argument constructors previously left `id`
    // uninitialized (reading it was undefined behavior); initialize to -1.
    // Strings are now taken by const reference to avoid a copy; the
    // implicit string->Word conversion used by set::find still works.
    Word() : id(-1) {}
    Word(const std::string &argString) : token(argString), id(-1) {}
    Word(const std::string &argString, int argId) : token(argString), id(argId) {}
};
30 |
31 |
// Bidirectional token <-> id mapping.
// wordList serves id -> token lookup (index == id); wordSet serves
// token -> id lookup via Word's token-only ordering.
class Vocabulary
{
public:
    // NOTE(review): container element types were lost in extraction;
    // usage in vocab.cpp implies vector<Word> and set<Word>.
    std::vector wordList;
    std::set wordSet;

    Vocabulary(){}
    Vocabulary(const string &fname);  // load from file (see loadVocab)

    void clear();
    bool hasWord(const string & token);
    void insertWord(const string & token);  // assigns the next free id

    string getWordById(int argId);
    int getIdByWord(string argToken);       // -1 when unknown

    void loadVocab(string fname);
    void writeVocab(string fname);

    void printVocabAbbr();  // one-line summary
    void printVocabFull();  // every (token,id) pair
};
54 |
55 |
56 | #endif
--------------------------------------------------------------------------------
/src_culda/train/MultiChunkMultiGPUequal.h:
--------------------------------------------------------------------------------
1 | #ifndef _MultiChunkMultiGPUequal_H_
2 | #define _MultiChunkMultiGPUequal_H_
3 |
4 |
// Train LDA with one document chunk per GPU (requires numChunks == numGPUs).
//
// Per iteration: every GPU samples its chunk (LDAKernelTrain on its main
// stream); phi updates are collected/reduced onto GPU 0 and redistributed;
// meanwhile one pthread per GPU rebuilds theta on a branch stream,
// overlapping the phi reduction.
//
// NOTE(review): the kernel launch configurations below were garbled by
// extraction (the "<<<...>>>" launch syntax is incomplete) - restore from
// the original sources before compiling.
void static MultiChunkMultiGPUequal(Document &doc, Vocabulary &vocab, Argument &arg,
                                    ModelPhi &modelPhi, ModelTheta &modelTheta)
{

    /* data preparation and transfer */
    gpuErr(cudaPeekAtLastError());
    for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){
        doc.docChunkVec[chunkId]->allocGPU(chunkId);
        doc.docChunkVec[chunkId]->toGPU();
    }
    printf("\n");

    /* modelphi: initial counts per GPU, then reduce onto the master
       (GPU 0) and broadcast the reduced model back. */
    modelPhi.InitData(doc);
    printf("doc.numChunks:%d\n", doc.numChunks);
    for(int i = 0; i < doc.numChunks; i++)
        modelPhi.UpdatePhiGPU(doc, i);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    for(int i = 1;i < arg.numGPUs; i++){
        modelPhi.MasterGPUCollect(i);
        modelPhi.MasterGPUReduce();
    }
    for(int i = 1;i < arg.numGPUs; i++)
        modelPhi.MasterGPUDistribute(i);

    modelPhi.MasterGPUToCPU();
    modelPhi.validPhi(doc);
    modelPhi.UpdatePhiHead(arg.beta);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    /* model theta */
    modelTheta.InitData(doc); //alloc GPU+CPU memory space.
    modelTheta.UpdateThetaGPU(doc);
    modelTheta.toCPU();
    //modelTheta.validTheta(doc);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());


    /* prepare the randstate, used for random sampling. */


    int randStateSize = 256;
    curandState *deviceRandState[MaxNumGPU];
    for(int i = 0;i < arg.numGPUs;i++){
        cudaSetDevice(i);
        cudaMalloc(&deviceRandState[i], sizeof(curandState)*randStateSize);
        initRandState<<>>(deviceRandState[i]);
    }

    /* per-GPU: a main stream (sampling/phi), a branch stream (theta
       update thread), and the pthread argument bundle. */
    cudaStream_t mainStream[MaxNumGPU];
    cudaStream_t branStream[MaxNumGPU];
    pthreadArgTheta thetaArgs[MaxNumGPU];
    pthread_t threads[MaxNumGPU];
    for(int i = 0;i < arg.numGPUs;i++){
        cudaSetDevice(i);
        cudaStreamCreate(&mainStream[i]);
        cudaStreamCreate(&branStream[i]);

        thetaArgs[i].mainStream = mainStream[i];
        thetaArgs[i].branStream = branStream[i];
        thetaArgs[i].thetaPtr = &modelTheta;
        thetaArgs[i].id = i;
        thetaArgs[i].docPtr = &doc;
    }



    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //launch kernels

    struct timespec begin, end;
    double elapsed = 0, stamp = 0;

    printf("Launching Sampling Part ...\n");
    for(int ite = 0;ite < arg.iteration; ite++){

        //printf("Iteration %3d:", ite + 1);

        clock_gettime(CLOCK_MONOTONIC, &begin);


        /* sampling kernel on every GPU's main stream */
        for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){

            cudaSetDevice(chunkId);

            cudaDeviceSynchronize();
            gpuErr(cudaPeekAtLastError());

            //LDAKernelTrain<<>>(
            LDAKernelTrain<<numSlots, TrainBlockSize, 0, mainStream[chunkId]>>>(
                arg.k,
                arg.alpha,
                arg.beta,
                doc.numDocs,
                doc.numWords,
                doc.docChunkVec[chunkId]->chunkNumTokens,
                doc.docChunkVec[chunkId]->deviceWordIndices,
                doc.docChunkVec[chunkId]->deviceSlotIdToWordId,
                doc.docChunkVec[chunkId]->deviceSlotIndices,
                doc.docChunkVec[chunkId]->deviceWordTokens,
                doc.docChunkVec[chunkId]->deviceWordTopics,
                modelTheta.thetaChunkVec[chunkId]->deviceThetaA,
                modelTheta.thetaChunkVec[chunkId]->deviceThetaMaxIA,
                modelTheta.thetaChunkVec[chunkId]->deviceThetaCurIA,
                modelTheta.thetaChunkVec[chunkId]->deviceThetaJA,
                modelTheta.thetaChunkVec[chunkId]->docIdStart,
                modelPhi.phiChunkVec[chunkId]->devicePhiTopicWordShort,
                modelPhi.phiChunkVec[chunkId]->devicePhiTopic,
                modelPhi.phiChunkVec[chunkId]->devicePhiHead,
                deviceRandState[chunkId],
                randStateSize,
                chunkId,
                doc.docChunkVec[chunkId]->deviceWordPerplexity,
                doc.docChunkVec[chunkId]->deviceDocRevIndices
            );
        }
        /* refresh phi counts on each GPU after sampling */
        for(int i = 0;i < arg.numGPUs; i++)
            modelPhi.UpdatePhiGPU(doc, i, mainStream[i]);


        double logLike = LDATrainPerplexity(doc, mainStream);
        //printf("log likelyhood :%.8f\n", logLike);

        /* overlap: theta-rebuild threads run on the branch streams while
           phi is reduced/redistributed on the main streams below */
        for(int i = 0;i < arg.numGPUs;i++){
            pthread_create(&(threads[i]),
                           NULL,
                           UpdateThetaThread,
                           (void*)(&(thetaArgs[i])));
            //pthread_join(threads[i], NULL);
        }


        for(int i = 1;i < arg.numGPUs; i++){
            cudaStreamSynchronize(mainStream[i]);
            modelPhi.MasterGPUCollect(i, mainStream[0]);
            modelPhi.MasterGPUReduce(mainStream[0]);
        }

        cudaStreamSynchronize(mainStream[0]);

        for(int i = 1;i < arg.numGPUs; i++)
            modelPhi.MasterGPUDistribute(i, mainStream[i]);

        modelPhi.UpdatePhiHead(arg.beta, mainStream);

        for(int i = 0;i < arg.numGPUs;i++)
            pthread_join(threads[i], NULL);

        cudaDeviceSynchronize();

        clock_gettime(CLOCK_MONOTONIC, &end);
        stamp = end.tv_sec - begin.tv_sec;
        stamp += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
        elapsed += stamp;

        printf("Iteration, %d,%.3f sec,%.3f sec, %.8f, %.3f M\n", ite+1,elapsed, stamp, logLike, doc.numTokens/stamp/1000000);

        // NOTE(review): presumably a cooling/throttling workaround - confirm
        if((ite + 1)%20 == 0) sleep(60);
    }

    /*
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());


    for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
        doc.docChunkVec[chunkId]->toCPU();
    printf("\n");
    modelTheta.toCPU();
    modelTheta.validTheta(doc);

    //modelPhi.MasterGPUToCPU();
    //modelPhi.validPhi(doc);

    for(int i = 0;i < arg.numChunks; i++)cudaFree(deviceRandState[i]);
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());
    */
}
193 |
194 | #endif
195 |
--------------------------------------------------------------------------------
/src_culda/train/SingleChunkSingleGPU.h:
--------------------------------------------------------------------------------
1 | #ifndef _SingleChunkSingleGPU_H_
2 | #define _SingleChunkSingleGPU_H_
3 |
4 |
// Train LDA when the whole corpus is a single chunk on a single GPU
// (device 0).  Each iteration samples all tokens, measures the log
// likelihood, then rebuilds phi and theta from the new assignments.
//
// NOTE(review): the kernel launch configurations below were garbled by
// extraction (the "<<<...>>>" launch syntax is incomplete) - restore from
// the original sources before compiling.
void static SingleChunkSingleGPU(Document &doc, Vocabulary &vocab, Argument &arg,
                                 ModelPhi &modelPhi, ModelTheta &modelTheta)
{

    /* data preparation and transfer */

    printf("Call SingleChunkSingleGPU() ...\n");


    printf("alloc gpu for doc ...\n");
    doc.docChunkVec[0]->allocGPU(0);
    printf("to gpu for doc ...\n");
    doc.docChunkVec[0]->toGPU();

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    /* model phi */
    printf("Prepare model phi ...\n");
    modelPhi.InitData(doc);
    modelPhi.UpdatePhiGPU(doc, 0);
    modelPhi.UpdatePhiHead(arg.beta);
    //modelPhi.MasterGPUToCPU();
    //modelPhi.validPhi(doc);


    /* model theta */
    printf("Prepare model theta ...\n");
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    modelTheta.InitData(doc);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    modelTheta.UpdateThetaGPU(doc);

    //exit(0);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //modelTheta.toCPU();
    //modelTheta.validTheta(doc);

    //exit(0);


    /* prepare the randstate */
    int randStateSize = 256*20;
    curandState *deviceRandState[MaxNumGPU];
    cudaMalloc(&deviceRandState[0], sizeof(curandState)*randStateSize);
    initRandState<<>>(deviceRandState[0]);

    cudaStream_t extraStream;
    cudaStreamCreate(&extraStream);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    struct timespec begin, end;
    double elapsed = 0, stamp = 0;

    //launch train kernels
    for(int ite = 0;ite < arg.iteration; ite++)
    {
        clock_gettime(CLOCK_MONOTONIC, &begin);

        //numBlocks = 100;
        LDAKernelTrain<<numSlots, TrainBlockSize>>>(
            arg.k,
            arg.alpha,
            arg.beta,
            doc.numDocs,
            doc.numWords,
            doc.docChunkVec[0]->chunkNumTokens,
            doc.docChunkVec[0]->deviceWordIndices,
            doc.docChunkVec[0]->deviceSlotIdToWordId,
            doc.docChunkVec[0]->deviceSlotIndices,
            doc.docChunkVec[0]->deviceWordTokens,
            doc.docChunkVec[0]->deviceWordTopics,
            modelTheta.thetaChunkVec[0]->deviceThetaA,
            modelTheta.thetaChunkVec[0]->deviceThetaMaxIA,
            modelTheta.thetaChunkVec[0]->deviceThetaCurIA,
            modelTheta.thetaChunkVec[0]->deviceThetaJA,
            modelTheta.thetaChunkVec[0]->docIdStart,
            modelPhi.phiChunkVec[0]->devicePhiTopicWordShort,
            modelPhi.phiChunkVec[0]->devicePhiTopic,
            modelPhi.phiChunkVec[0]->devicePhiHead,
            deviceRandState[0],
            randStateSize, //arg.numWorkers,
            0,
            doc.docChunkVec[0]->deviceWordPerplexity,
            doc.docChunkVec[0]->deviceDocRevIndices
        );

        //cudaDeviceSynchronize();
        //gpuErr(cudaPeekAtLastError());

        double logLike = LDATrainPerplexity(doc);
        //cudaDeviceSynchronize();
        //gpuErr(cudaPeekAtLastError());

        //doc.docChunkVec[0]->toCPU();

        /* rebuild phi, then theta, from the new topic assignments */
        modelPhi.UpdatePhiGPU(doc, 0);
        modelPhi.UpdatePhiHead(arg.beta);
        //modelPhi.MasterGPUToCPU();
        //modelPhi.validPhi(doc);

        modelTheta.UpdateThetaGPU(doc);
        //modelTheta.toCPU();
        //modelTheta.validTheta(doc);

        cudaDeviceSynchronize();

        clock_gettime(CLOCK_MONOTONIC, &end);
        stamp = end.tv_sec - begin.tv_sec;
        stamp += (end.tv_nsec - begin.tv_nsec) / 1000000000.0;
        elapsed += stamp;

        printf("Iteration %3d: %6.2f sec, %3.2f sec, logLikelyhood = %.8f, %5.3f M\n", ite+1,elapsed, stamp, logLike, doc.numTokens/stamp/1000000);
        cudaDeviceSynchronize();
        gpuErr(cudaPeekAtLastError());

        // if((ite + 1)%30 == 0)sleep(120);

    }

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    /* copy the final models back to the host */
    for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
        doc.docChunkVec[chunkId]->toCPU();
    printf("\n");

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    modelTheta.toCPU();
    //modelTheta.validTheta(doc);

    modelPhi.MasterGPUToCPU();
    //cudaDeviceSynchronize();
    //modelPhi.validPhi(doc);



    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());
    //modelPhi.savePhi("phi.data");
}
158 |
159 |
160 | #endif
161 |
--------------------------------------------------------------------------------
/src_culda/train/lda_train.cu:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include
12 | #include
13 |
14 | #include
15 |
16 | #include "../model/vocab.h"
17 | #include "../model/doc.h"
18 | #include "lda_train.h"
19 | #include "../kernel/lda_train_kernel.h"
20 |
21 |
// Argument bundle handed to UpdateThetaThread via pthread_create.
struct pthreadArgTheta
{
    int id;                  // GPU/chunk index this thread serves
    Document *docPtr;        // shared document data
    ModelTheta *thetaPtr;    // theta model; the thread updates chunk `id`
    cudaStream_t mainStream; // stream the sampling kernel runs on (waited for)
    cudaStream_t branStream; // stream the theta update is issued on
};
30 |
31 | static void *UpdateThetaThread(void *arg)
32 | {
33 | pthreadArgTheta *localArg = (pthreadArgTheta*)arg;
34 | cudaStreamSynchronize(localArg->mainStream);
35 | localArg->thetaPtr->thetaChunkVec[localArg->id]->UpdateThetaGPU(*(localArg->docPtr), localArg->branStream);
36 |
37 | return NULL;
38 | }
39 |
40 | #include "SingleChunkSingleGPU.h"
41 | #include "MultiChunkMultiGPUequal.h"
42 | //#include "MultiChunkMultiGPUNotequal.h"
43 |
44 | void LDATrain(Document &doc, Vocabulary &vocab, Argument &arg)
45 | {
46 | if(arg.numGPUs > doc.numChunks) arg.numGPUs = doc.numChunks;
47 |
48 | //ModelPhi preparation.
49 | ModelPhi modelPhi(arg.k, arg.numGPUs, doc.numDocs, doc.numWords, doc.numChunks);
50 | ModelTheta modelTheta(arg.k, doc.numDocs, doc.numWords, doc.numChunks);
51 |
52 | if(doc.numChunks == 1) //One chunk, one GPU.
53 | SingleChunkSingleGPU(doc, vocab, arg, modelPhi, modelTheta);
54 | else if(doc.numChunks != 1 && arg.numGPUs != 1 && arg.numGPUs == doc.numChunks)
55 | MultiChunkMultiGPUequal(doc, vocab, arg, modelPhi, modelTheta);
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src_culda/train/lda_train.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef _LDA_TRAIN_H_
3 | #define _LDA_TRAIN_H_
4 |
5 |
6 | #include "../model/model_theta.h"
7 | #include "../model/model_phi.h"
8 | #include "../model/culda_argument.h"
9 | #include "../model/vocab.h"
10 | #include "../model/doc.h"
11 |
12 | #include "../kernel/lda_train_kernel.h"
13 |
14 | void LDATrain(Document &doc, Vocabulary &vocab, Argument &argu);
15 |
16 | double LDALikelihood(Argument&, Document&, ModelTheta&, ModelPhi&);
17 |
18 | #endif
--------------------------------------------------------------------------------
/src_format/.gitignore:
--------------------------------------------------------------------------------
1 | 2.idx
2 | data_format.o
3 | format
4 | format.o
5 | train.*
6 | vocab.*
7 |
--------------------------------------------------------------------------------
/src_format/Makefile:
--------------------------------------------------------------------------------
1 |
# Build the ./format corpus converter.
# FIX: vocab.cpp/vocab.h live under ../src_culda/model/ (format.cpp also
# includes "../src_culda/model/vocab.h"); the old rules pointed at
# ../src_culda/vocab.* which does not exist in this tree.
format: format.o vocab.o
	nvcc -o format format.o vocab.o

vocab.o: ../src_culda/model/vocab.cpp ../src_culda/model/vocab.h
	nvcc -c ../src_culda/model/vocab.cpp -o vocab.o

format.o: format.cpp ../src_culda/model/vocab.h data_chunk.h
	nvcc -c format.cpp -o format.o

clean:
	rm -f *.o format
--------------------------------------------------------------------------------
/src_format/data_chunk.h:
--------------------------------------------------------------------------------
1 | #ifndef _DATA_CHUNK_H_
2 | #define _DATA_CHUNK_H_
3 |
4 | #include
// One output partition of the formatted corpus: collects per-word token
// payloads and writes them as an index (.idx, text) plus a data file
// (.data, binary ints).
// NOTE(review): template arguments of the containers below were lost in
// extraction; usage implies wordFirstVec is vector<vector<int> >.
class DataChunk{

public:
    int numWords;   // vocabulary size
    int numChunks;  // total number of chunks being produced
    int chunkId;    // index of this chunk

    int docIdStart; // first doc id in this chunk
    int docIdEnd;   // one past the last doc id

    int chunkDocSize;
    long long chunkTokenSize;

    string outFilePrefix;
    string outFileIdxName;   // "<prefix>.word.idx"  (+chunkId when numChunks > 1)
    string outFileDataName;  // "<prefix>.word.data" (+chunkId when numChunks > 1)

    // wordFirstVec[wordId] holds one int payload per occurrence of wordId;
    // presumably the doc id of each occurrence - confirm against format.cpp.
    vector > wordFirstVec;

    // Remember the layout parameters and derive the output file names.
    // The chunk id is appended to the names only when several chunks exist.
    DataChunk(int argNumWords, int argNumChunks, int argChunkId, string argFilePrefix)
    {
        numWords = argNumWords;
        numChunks = argNumChunks;
        chunkId = argChunkId;
        outFilePrefix = argFilePrefix;

        // One (initially empty) occurrence list per vocabulary word.
        for(int i = 0;i < numWords;i++)
            wordFirstVec.push_back(vector());

        stringstream tmpNameStream;
        outFileIdxName = outFilePrefix + ".word.idx";
        outFileDataName = outFilePrefix + ".word.data";
        if(numChunks > 1){
            tmpNameStream << outFileIdxName << chunkId;
            tmpNameStream >> outFileIdxName;
            tmpNameStream.clear();
            tmpNameStream << outFileDataName << chunkId;
            tmpNameStream >> outFileDataName;
        }

        docIdStart = 0;
        docIdEnd = 0;
    }

    // Write this chunk to disk:
    //  - .idx  (text):   one "wordId cumulativeOffset" line per word, where
    //                    the offset is the running total AFTER this word
    //  - .data (binary): every word's int payloads, concatenated in word order
    void writeChunk()
    {
        ofstream outFileIdxStream(outFileIdxName.c_str(), ios::out);
        ofstream outFileDataStream(outFileDataName.c_str(), ios::out|ios::binary);

        long long offset = 0;
        for(int wordId = 0; wordId < wordFirstVec.size();wordId++){
            offset += wordFirstVec[wordId].size();
            outFileIdxStream << wordId << " " << offset << endl;

            for(int localTokenId = 0; localTokenId < wordFirstVec[wordId].size(); localTokenId ++){
                int tmpWord = wordFirstVec[wordId][localTokenId];
                outFileDataStream.write((char*)&(tmpWord), sizeof(int));
            }
        }

        outFileIdxStream.close();
        outFileDataStream.close();
    }
    ~DataChunk()
    {
    }

};
73 |
74 | #endif
--------------------------------------------------------------------------------
/src_format/format.cpp:
--------------------------------------------------------------------------------
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <cstdio>
#include <cstdlib>

//#include "../src_culda/data_format.h"
#include "../src_culda/model/vocab.h"
#include "../src_culda/model/culda_argument.h"

#include "data_chunk.h"

using namespace std;
17 |
18 |
19 | int main(int argc, char**argv)
20 | {
21 |
22 | int numChunks = 0;
23 | if(argc != 3 && argc != 4){
24 | cout << "Usage:./format input output_prefix numChunks[default=1]" << endl;
25 | return 0;
26 | }
27 | if(argc == 4) numChunks = atoi(argv[3]);
28 | if(numChunks <= 1)numChunks = 1;
29 |
30 | //open file
31 | ifstream inputFile(argv[1], ios::in);
32 |
33 | if(!inputFile.is_open()){
34 | cout << argv[1] << " open failed" << endl;;
35 | exit(0);
36 | }
37 |
38 | ofstream outFileVocab((argv[2] + string(".vocab")).c_str(), ios::out);
39 | ofstream outFileChunk((argv[2] + string(".chunk")).c_str(), ios::out);
40 | ofstream outFileDocIdx((argv[2] + string(".doc.idx")).c_str(), ios::out);
41 |
42 | if(!outFileVocab.is_open()){
43 | cout << argv[2] << ".vocab open failed" << endl;;
44 | exit(0);
45 | }
46 | if(!outFileChunk.is_open()){
47 | cout << argv[2] << ".chunk open failed" << endl;;
48 | exit(0);
49 | }
50 | if(!outFileDocIdx.is_open()){
51 | cout << argv[2] << ".doc.idx open failed" << endl;;
52 | exit(0);
53 | }
54 |
55 | printf("reading input file ...\n");
56 | //read input file
57 | Vocabulary vocab;
58 | long long numTokens = 0;
59 | vector > wordFirstVec;
60 | vector docLengthVec;
61 | int docId = 0;
62 |
63 | string docLine;
64 | while(getline(inputFile, docLine)){
65 |
66 | string docTitle, token;
67 |
68 | std::istringstream docStream(docLine);
69 | if(!(docStream >> docTitle))continue;
70 |
71 | int docLength = 0;
72 | while(docStream >> token){
73 |
74 | docLength ++;
75 | vocab.insertWord(token);
76 | int id = vocab.getIdByWord(token);
77 | numTokens ++;
78 |
79 | //insert it to wordFirstVec
80 | if(wordFirstVec.size() >= id){
81 |
82 | int numLoops = id + 1 - wordFirstVec.size();
83 | for(int i = 0;i < numLoops; i++)
84 | wordFirstVec.push_back(vector());
85 | }
86 | wordFirstVec[id].push_back(docId);
87 | }
88 | outFileDocIdx << docId << " " << numTokens << endl;
89 |
90 | docLengthVec.push_back(docLength);
91 | docId ++;
92 | }
93 | inputFile.close();
94 |
95 | printf("write vocabulary ...\n");
96 | //write vocabulary
97 | for(int i = 0;i < vocab.wordList.size();i++){
98 | outFileVocab << vocab.wordList[i].token << " " << vocab.wordList[i].id << endl;
99 | }
100 | outFileVocab.close();
101 |
102 | //cout << "wordnumbers:" << vocab.word_list.size() << endl;
103 | //cout << "wordFirstVec:" << wordFirstVec.size() << endl;
104 |
105 | printf("mapping chunks ...\n");
106 | //decide the doc -> chunk mapping
107 | long long tokenPerChunk = (numTokens + numChunks - 1)/numChunks;
108 | vector dataChunkVec;
109 | vector docToChunkVec;
110 | docId = 0;
111 | printf("numChunks:%lld\n", numChunks);
112 | printf("numTokens:%lld\n", numTokens);
113 | printf("perChunk :%lld\n", tokenPerChunk);
114 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){
115 | dataChunkVec.push_back(DataChunk(vocab.wordList.size(), numChunks, chunkId, argv[2]));
116 |
117 | long long tmpChunkSize = 0;
118 | dataChunkVec[chunkId].docIdStart = docId;
119 | while(docId < docLengthVec.size()){
120 | tmpChunkSize += docLengthVec[docId];
121 | docToChunkVec.push_back(chunkId);
122 | docId ++;
123 |
124 | if(tmpChunkSize >= tokenPerChunk)break;
125 | }
126 | dataChunkVec[chunkId].docIdEnd = docId;
127 | dataChunkVec[chunkId].chunkTokenSize = tmpChunkSize;
128 | dataChunkVec[chunkId].chunkDocSize =
129 | dataChunkVec[chunkId].docIdEnd - dataChunkVec[chunkId].docIdStart;
130 |
131 | outFileChunk << chunkId << " " << docId << endl;
132 | }
133 |
134 | for(int chunkId = 0;chunkId < numChunks; chunkId ++){
135 | printf("----\n");
136 | printf("chunkId:%d\n", chunkId);
137 | printf("numWords:%d\n", dataChunkVec[chunkId].numWords);
138 | printf("doc range:%d - %d\n", dataChunkVec[chunkId].docIdStart, dataChunkVec[chunkId].docIdEnd);
139 | printf("chunkSize:%lld\n", dataChunkVec[chunkId].chunkTokenSize);
140 | printf("%s\n", dataChunkVec[chunkId].outFileIdxName.c_str());
141 | printf("%s\n", dataChunkVec[chunkId].outFileDataName.c_str());
142 | }
143 |
144 | printf("chunk partitioning ...\n");
145 | //distribute the data to each chunk
146 | long long offset = 0;
147 | for(int wordId = 0; wordId < wordFirstVec.size(); wordId ++){
148 |
149 | for(int localTokenId = 0; localTokenId < wordFirstVec[wordId].size(); localTokenId ++){
150 |
151 | int docId = wordFirstVec[wordId][localTokenId];
152 | int chunkId = docToChunkVec[docId];
153 |
154 | dataChunkVec[chunkId].wordFirstVec[wordId].push_back(docId);
155 | }
156 | }
157 |
158 | printf("write chunks ...\n");
159 | //write
160 | for(int chunkId = 0;chunkId < numChunks;chunkId++)
161 | {
162 | printf("writing chunk %d ...\n", chunkId);
163 | dataChunkVec[chunkId].writeChunk();
164 | }
165 |
166 | outFileChunk.close();
167 |
168 | return 0;
169 | }
--------------------------------------------------------------------------------