├── README.md ├── src_culda ├── .gitignore ├── Makefile ├── kernel │ ├── lda_phi_kernel.cu │ ├── lda_phi_kernel.h │ ├── lda_theta_kernel.back.cu │ ├── lda_theta_kernel.back.h │ ├── lda_theta_kernel.cu │ ├── lda_theta_kernel.h │ ├── lda_train_kernel.cu │ └── lda_train_kernel.h ├── main.cpp ├── model │ ├── culda_argument.h │ ├── doc.cpp │ ├── doc.h │ ├── doc_chunk.cpp │ ├── doc_chunk.h │ ├── model_phi.cpp │ ├── model_phi.h │ ├── model_phi_gpu.cpp │ ├── model_phi_gpu.h │ ├── model_theta.cpp │ ├── model_theta.h │ ├── model_theta_chunk.cpp │ ├── model_theta_chunk.h │ ├── vocab.cpp │ └── vocab.h └── train │ ├── MultiChunkMultiGPUequal.h │ ├── SingleChunkSingleGPU.h │ ├── lda_train.cu │ └── lda_train.h └── src_format ├── .gitignore ├── Makefile ├── data_chunk.h └── format.cpp /README.md: -------------------------------------------------------------------------------- 1 | # CuLDA_CGS 2 | 3 | CuLDA_CGS is GPU solution for CGS-based LDA sampling. It's efficient and is able to achieve 686M tokens/sec. To the best of our knowledge, it's the first LDA solution that support GPUs. 4 | 5 | 6 | ## Input Data Preparation 7 | ./src_format contantions a program to transform to text corpus to the input format of CuLDA_CGS. The transformed data format is more efficient for subsequent processing and partitioned to multiple chunks to support multi-GPU scaling. 8 | 9 | Run Command "make" in the directory and use the following command to transform the data: 10 | 11 | ./format input output_prefix numChunks[default=1] 12 | 13 | The input format of ./format is like: 14 | 15 | doc-name1 token1 token2 token3\n 16 | doc-name2 token4 token5 token6\n 17 | ... 18 | 19 | Tokens are separated by space, documents are separated by line. 20 | 21 | ## Compile and Run CuLDA_CGS 22 | Everything about CuLDA_CGS is in ./src_culda. It does not relies on any 3rd party denpendency. What you need is only a CUDA environment and a CUDA-enabled GPU. 
23 | 24 | Before you run command "make" in the directory, remember to change CXX_FLAG to your targeted architecture and change CUDA_INSTALL_PATH to your CUDA directory. 25 | 26 | Then you can run ./culda for LDA sampling, the usage is: 27 | 28 | ./culda [options] 29 | 30 | Possible options
31 | 32 | -g
33 | -k : currently only supports 1024
34 | -t
35 | -s : it has been deprecated
36 | -a : 50/1024 for our tested data sets
37 | -b : 0.01 for our tested data sets
38 | -c : must be equal to -g, and must be consistent with the chunk number specified in the data preparation stage
39 | -i : Same as the output_prefix in the data preparation stage.
40 | -o : It's not used now. Rewrite ModelPhi::savePhi and ModelTheta::saveTheta as you need it.
41 | 42 | CuLDA_CGS outputs the number of processed token per sec and the loglikelyhood after each iteration. 43 | 44 | -------------------------------------------------------------------------------- /src_culda/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | culda 3 | obj/* 4 | Theta* 5 | theta* 6 | Phi* 7 | phi* 8 | Topics* 9 | topics* 10 | *ini 11 | -------------------------------------------------------------------------------- /src_culda/Makefile: -------------------------------------------------------------------------------- 1 | #go! 2 | 3 | 4 | 5 | CXX = nvcc 6 | CUDA_INSTALL_PATH = /usr/local/cuda 7 | OBJECTIVE = ./obj 8 | CXX_FLAG = -arch=sm_61 9 | #-maxrregcount=32 10 | 11 | #-maxrregcount=32 12 | 13 | DEBUG_FLAG = 14 | #DEBUG_FLAG = -G -g 15 | #DEBUG_FLAG = -DDEBUGPrint 16 | 17 | culda:$(OBJECTIVE)/main.o \ 18 | $(OBJECTIVE)/lda_train_kernel.o \ 19 | $(OBJECTIVE)/lda_train.o \ 20 | $(OBJECTIVE)/lda_theta_kernel.o \ 21 | $(OBJECTIVE)/lda_phi_kernel.o \ 22 | $(OBJECTIVE)/model_theta.o \ 23 | $(OBJECTIVE)/model_theta_chunk.o \ 24 | $(OBJECTIVE)/model_phi_gpu.o \ 25 | $(OBJECTIVE)/model_phi.o \ 26 | $(OBJECTIVE)/vocab.o \ 27 | $(OBJECTIVE)/doc.o \ 28 | $(OBJECTIVE)/doc_chunk.o 29 | $(CXX) $(OBJECTIVE)/main.o \ 30 | $(OBJECTIVE)/model_theta.o \ 31 | $(OBJECTIVE)/model_theta_chunk.o \ 32 | $(OBJECTIVE)/model_phi.o \ 33 | $(OBJECTIVE)/model_phi_gpu.o \ 34 | $(OBJECTIVE)/vocab.o \ 35 | $(OBJECTIVE)/doc.o \ 36 | $(OBJECTIVE)/doc_chunk.o \ 37 | $(OBJECTIVE)/lda_train_kernel.o \ 38 | $(OBJECTIVE)/lda_train.o \ 39 | $(OBJECTIVE)/lda_theta_kernel.o \ 40 | $(OBJECTIVE)/lda_phi_kernel.o \ 41 | -o culda \ 42 | -L $(CUDA_INSTALL_PATH)/lib64 \ 43 | $(CXX_FLAG) $(DEBUG_FLAG) 44 | 45 | 46 | $(OBJECTIVE)/main.o:\ 47 | main.cpp \ 48 | ./model/model_theta.h \ 49 | ./model/vocab.h \ 50 | ./model/doc.h \ 51 | ./model/doc_chunk.h \ 52 | ./model/culda_argument.h \ 53 | ./train/lda_train.h 54 | $(CXX) -c main.cpp \ 55 | -o 
$(OBJECTIVE)/main.o \ 56 | -I $(CUDA_INSTALL_PATH)/include \ 57 | -L $(CUDA_INSTALL_PATH)/lib64 \ 58 | $(CXX_FLAG) $(DEBUG_FLAG) 59 | 60 | $(OBJECTIVE)/doc.o:\ 61 | ./model/doc.cpp \ 62 | ./model/doc.h \ 63 | ./model/doc_chunk.h 64 | $(CXX) -c ./model/doc.cpp \ 65 | -o $(OBJECTIVE)/doc.o \ 66 | $(CXX_FLAG) $(DEBUG_FLAG) 67 | 68 | $(OBJECTIVE)/doc_chunk.o:\ 69 | ./model/doc_chunk.cpp \ 70 | ./model/doc_chunk.h \ 71 | ./model/culda_argument.h 72 | $(CXX) -c ./model/doc_chunk.cpp \ 73 | -o $(OBJECTIVE)/doc_chunk.o \ 74 | $(CXX_FLAG) $(DEBUG_FLAG) 75 | 76 | $(OBJECTIVE)/vocab.o:\ 77 | ./model/vocab.cpp \ 78 | ./model/vocab.h 79 | $(CXX) -c ./model/vocab.cpp \ 80 | -o $(OBJECTIVE)/vocab.o \ 81 | $(CXX_FLAG) $(DEBUG_FLAG) 82 | 83 | $(OBJECTIVE)/lda_theta_kernel.o:\ 84 | ./kernel/lda_theta_kernel.cu \ 85 | ./kernel/lda_theta_kernel.h \ 86 | ./kernel/lda_train_kernel.h \ 87 | ./model/culda_argument.h 88 | $(CXX) -c ./kernel/lda_theta_kernel.cu \ 89 | -o $(OBJECTIVE)/lda_theta_kernel.o \ 90 | $(CXX_FLAG) -Xptxas -v $(DEBUG_FLAG) 91 | 92 | $(OBJECTIVE)/lda_train_kernel.o:\ 93 | ./kernel/lda_train_kernel.cu \ 94 | ./kernel/lda_train_kernel.h \ 95 | ./model/culda_argument.h 96 | $(CXX) -c ./kernel/lda_train_kernel.cu \ 97 | -o $(OBJECTIVE)/lda_train_kernel.o \ 98 | $(CXX_FLAG) -Xptxas -v $(DEBUG_FLAG) 99 | 100 | $(OBJECTIVE)/lda_train.o:\ 101 | ./train/lda_train.cu \ 102 | ./train/lda_train.h \ 103 | ./train/MultiChunkMultiGPUequal.h \ 104 | ./train/SingleChunkSingleGPU.h \ 105 | ./kernel/lda_train_kernel.h \ 106 | ./model/doc.h \ 107 | ./model/doc_chunk.h \ 108 | ./model/vocab.h \ 109 | ./model/model_theta.h \ 110 | ./model/model_theta_chunk.h \ 111 | ./model/model_phi.h \ 112 | ./model/model_phi_gpu.h\ 113 | ./model/culda_argument.h 114 | $(CXX) -c ./train/lda_train.cu \ 115 | -o $(OBJECTIVE)/lda_train.o \ 116 | $(CXX_FLAG) $(DEBUG_FLAG) 117 | 118 | $(OBJECTIVE)/lda_phi_kernel.o:\ 119 | ./kernel/lda_phi_kernel.cu \ 120 | ./kernel/lda_phi_kernel.h \ 121 | 
./kernel/lda_train_kernel.h \ 122 | ./model/culda_argument.h 123 | $(CXX) -c ./kernel/lda_phi_kernel.cu \ 124 | -o $(OBJECTIVE)/lda_phi_kernel.o \ 125 | $(CXX_FLAG) $(DEBUG_FLAG) 126 | 127 | $(OBJECTIVE)/model_phi_gpu.o:\ 128 | ./model/model_phi_gpu.cpp \ 129 | ./model/model_phi_gpu.h \ 130 | ./model/doc.h \ 131 | ./model/doc_chunk.h \ 132 | ./model/vocab.h \ 133 | ./model/culda_argument.h \ 134 | ./kernel/lda_train_kernel.h 135 | $(CXX) -c ./model/model_phi_gpu.cpp \ 136 | -o $(OBJECTIVE)/model_phi_gpu.o \ 137 | $(CXX_FLAG) $(DEBUG_FLAG) 138 | 139 | $(OBJECTIVE)/model_phi.o:\ 140 | ./model/model_phi.cpp \ 141 | ./model/model_phi.h \ 142 | ./model/doc.h \ 143 | ./model/doc_chunk.h \ 144 | ./model/vocab.h \ 145 | ./model/culda_argument.h \ 146 | ./model/model_phi_gpu.h \ 147 | ./kernel/lda_train_kernel.h 148 | $(CXX) -c ./model/model_phi.cpp \ 149 | -o $(OBJECTIVE)/model_phi.o \ 150 | $(CXX_FLAG) $(DEBUG_FLAG) 151 | 152 | $(OBJECTIVE)/model_theta_chunk.o:\ 153 | ./model/model_theta_chunk.cpp \ 154 | ./model/model_theta_chunk.h \ 155 | ./model/doc.h \ 156 | ./model/doc_chunk.h \ 157 | ./model/vocab.h \ 158 | ./model/culda_argument.h \ 159 | ./kernel/lda_train_kernel.h 160 | $(CXX) -c ./model/model_theta_chunk.cpp \ 161 | -o $(OBJECTIVE)/model_theta_chunk.o \ 162 | $(CXX_FLAG) $(DEBUG_FLAG) 163 | 164 | $(OBJECTIVE)/model_theta.o:\ 165 | ./model/model_theta.cpp \ 166 | ./model/model_theta.h \ 167 | ./model/doc.h \ 168 | ./model/doc_chunk.h \ 169 | ./model/vocab.h \ 170 | ./model/culda_argument.h \ 171 | ./model/model_theta_chunk.h \ 172 | ./model/culda_argument.h \ 173 | ./kernel/lda_train_kernel.h 174 | $(CXX) -c ./model/model_theta.cpp \ 175 | -o $(OBJECTIVE)/model_theta.o \ 176 | $(CXX_FLAG) $(DEBUG_FLAG) 177 | 178 | 179 | clean: 180 | rm -f culda 181 | rm -f *.o 182 | rm -f obj/* 183 | -------------------------------------------------------------------------------- /src_culda/kernel/lda_phi_kernel.cu: 
-------------------------------------------------------------------------------- 1 | 2 | #include 3 | 4 | #include "lda_train_kernel.h" 5 | #include "lda_phi_kernel.h" 6 | 7 | /* phihead comput kernels */ 8 | __global__ void LDAcomputePhiHeadKernel( 9 | int k, 10 | float beta, 11 | int numWords, 12 | int numWordsPerWorker, 13 | PHITYPE *phiTopicWordShort, 14 | int *phiTopic, 15 | half *phiHead) 16 | { 17 | int tid = threadIdx.x + blockDim.x*blockIdx.x; 18 | int workerId = tid/32; 19 | int laneId = tid%32; 20 | int wordId = workerId; 21 | 22 | if(workerId >= numWords)return; 23 | 24 | for(int tmpk = laneId; tmpk < k; tmpk += 32){ 25 | 26 | float tmpHead = (phiTopicWordShort[wordId*k + tmpk] + beta)/(phiTopic[tmpk] + beta*numWords); 27 | 28 | //if(tmpk = 1024)tmpHead *=1.01; 29 | phiHead[wordId*k + tmpk] = __float2half(tmpHead); 30 | } 31 | } 32 | 33 | __global__ void LDAcheckPhiHeadKernel( 34 | int k, 35 | int numWords, 36 | half *phiHead) 37 | { 38 | 39 | int tid = threadIdx.x + blockDim.x*blockIdx.x; 40 | int wordId = tid/1024; 41 | int tmpk = tid%1024; 42 | 43 | float tmp = __half2float(phiHead[wordId*k + tmpk]); 44 | if(tmp < 0){ 45 | printf("phihead check error:wordid(%d), k(%d), head(%.6f), index(%d)\n", wordId, tmpk, tmp, wordId*k + tmpk); 46 | } 47 | } 48 | 49 | void LDAComputePhiHeadAPI( 50 | int k, 51 | float beta, 52 | int numWords, 53 | PHITYPE *phiTopicWordShort, 54 | int *phiTopic, 55 | half *phiHead, 56 | cudaStream_t stream) 57 | { 58 | 59 | //printf("call LDAComputePhiHeadAPI ...\n"); 60 | LDAcomputePhiHeadKernel<<<(numWords+3)/4,128,0, stream>>>( 61 | k, 62 | beta, 63 | numWords, 64 | 1, 65 | phiTopicWordShort, 66 | phiTopic, 67 | phiHead 68 | ); 69 | 70 | //printf("LDAcheckPhiHeadKernel ...\n"); 71 | //LDAcheckPhiHeadKernel<<>>(k, numWords, phiHead); 72 | } 73 | 74 | /* phi update kernels */ 75 | 76 | __global__ void LDAUpdatePhiKernel( 77 | int k, 78 | int numWords, 79 | long long *wordIndices, 80 | short *wordTopics, 81 | PHITYPE 
*phiTopicWordShort, 82 | int *phiTopicWordSub, 83 | int *phiTopic, 84 | int numWorkers) 85 | { 86 | 87 | int tid = threadIdx.x + blockDim.x*blockIdx.x; 88 | 89 | int workerId = tid/32; 90 | int laneId = tid%32; 91 | 92 | if(workerId >= numWorkers) return; 93 | 94 | for(int wordId = workerId; wordId < numWords; wordId += numWorkers){ 95 | 96 | long long tokenStart = __ldg(&wordIndices[wordId]); 97 | long long tokenEnd = __ldg(&wordIndices[wordId + 1]); 98 | 99 | //clean 100 | for(int offset = laneId; offset < k;offset += 32) 101 | phiTopicWordSub[workerId*k + offset] = 0; 102 | 103 | //add 104 | for(long long tokenIdx = tokenStart + laneId; 105 | tokenIdx < tokenEnd; 106 | tokenIdx += 32) //iterate over tokens 107 | { 108 | int tmpK = __ldg(&wordTopics[tokenIdx]); 109 | atomicAdd(&(phiTopicWordSub[workerId*k + tmpK]),1); 110 | atomicAdd(&(phiTopic[tmpK]),1); 111 | } 112 | 113 | //transform 114 | for(int offset = laneId; offset < k;offset += 32) 115 | phiTopicWordShort[wordId*k + offset] = phiTopicWordSub[workerId*k + offset]; 116 | } 117 | } 118 | 119 | /* 120 | __global__ void LDAPhiCheckKernel( 121 | int k, 122 | int numWords, 123 | PHITYPE *phiTopicWordShort) 124 | { 125 | int tid = threadIdx.x + blockDim.x*blockIdx.x; 126 | if(tid > k*numWords)return; 127 | 128 | if(phiTopicWordShort[tid] < 0) 129 | { 130 | printf("phi check error: word(%d), k(%d), int(%d), short(%d)\n", 131 | tid/k, tid%k, phiTopicWordShort[tid], phiTopicWordShort[tid]); 132 | } 133 | } 134 | */ 135 | 136 | void LDAUpdatePhiAPI( 137 | int k, 138 | int numWords, 139 | long long *wordIndices, 140 | short *wordTopics, 141 | PHITYPE *phiTopicWordShort, 142 | int *phiTopicWordSub, 143 | int *phiTopic, 144 | cudaStream_t stream) 145 | { 146 | 147 | LDAUpdatePhiKernel<<<(UpdateNumWorkers+3)/4, 128, 0,stream>>>( 148 | k, 149 | numWords, 150 | wordIndices, 151 | wordTopics, 152 | phiTopicWordShort, 153 | phiTopicWordSub, 154 | phiTopic, 155 | UpdateNumWorkers 156 | ); 157 | 158 | 
//LDAPhiCheckKernel<<<(k*numWords + 127)/128, 128,0, stream>>>(k,numWords,phiTopicWordShort); 159 | } 160 | 161 | /* MultiGPU Reduce Kernels */ 162 | __global__ void LDAUpdatePhiReduceKernelShort( 163 | int k, 164 | int numWords, 165 | PHITYPE *phiTopicWordShort, 166 | PHITYPE *phiTopicWordShortCopy) 167 | { 168 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 169 | if(tid < k*numWords) phiTopicWordShort[tid] += phiTopicWordShortCopy[tid]; 170 | } 171 | 172 | __global__ void LDAUpdatePhiReduceKernelInt( 173 | int k, 174 | int numWords, 175 | int *phiTopic, 176 | int *phiTopicCopy) 177 | { 178 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 179 | //if(tid == 0) 180 | // printf("phiTopic[0]:%d, phiTopicShort[0]:%d\n", phiTopic[0], phiTopicCopy[0]); 181 | 182 | if(tid < k) phiTopic[tid] += phiTopicCopy[tid]; 183 | } 184 | 185 | void LDAUpdatePhiReduceAPI( 186 | int k, 187 | int numWords, 188 | PHITYPE *phiTopicWordShort, 189 | PHITYPE *phiTopicWordShortCopy, 190 | int *phiTopic, 191 | int *phiTopicCopy, 192 | cudaStream_t stream) 193 | { 194 | 195 | LDAUpdatePhiReduceKernelShort<<<(k*numWords + 127)/128,128,0,stream>>>( 196 | k, 197 | numWords, 198 | phiTopicWordShort, 199 | phiTopicWordShortCopy 200 | ); 201 | 202 | LDAUpdatePhiReduceKernelInt<<<(k + 127)/128,128,0,stream>>>( 203 | k, 204 | numWords, 205 | phiTopic, 206 | phiTopicCopy 207 | ); 208 | } -------------------------------------------------------------------------------- /src_culda/kernel/lda_phi_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _LDA_PHI_KERNEL_H_ 2 | #define _LDA_PHI_KERNEL_H_ 3 | 4 | #include 5 | #include "../model/culda_argument.h" 6 | 7 | 8 | /* phihead comput kernels */ 9 | __global__ void LDAcomputePhiHeadKernel( 10 | int k, 11 | float beta, 12 | int numWords, 13 | int numWordsPerWorker, 14 | PHITYPE *phiTopicWordShort, 15 | int *phiTopic, 16 | half *phiHead 17 | ); 18 | 19 | __global__ void LDAcheckPhiHeadKernel( 20 | int k, 21 | 
int numWords, 22 | half *phiHead 23 | ); 24 | 25 | void LDAComputePhiHeadAPI( 26 | int k, 27 | float beta, 28 | int numWords, 29 | PHITYPE *phiTopicWordShort, 30 | int *phiTopic, 31 | half *phiHead, 32 | cudaStream_t stream=0 33 | ); 34 | 35 | /* phi update kernels */ 36 | __global__ void LDAUpdatePhiKernel( 37 | int k, 38 | int numWords, 39 | long long *wordIndices, 40 | short *wordTopics, 41 | PHITYPE *phiTopicWordShort, 42 | int *phiTopicWordSub, 43 | int *phiTopic, 44 | int numWorkers 45 | ); 46 | 47 | void LDAUpdatePhiAPI( 48 | int k, 49 | int numWords, 50 | long long *wordIndices, 51 | short *wordTopics, 52 | PHITYPE *phiTopicWordShort, 53 | int *phiTopicWordSub, 54 | int *phiTopic, 55 | cudaStream_t stream=0 56 | ); 57 | 58 | /* 59 | __global__ void LDAPhiCheckKernel( 60 | int k, 61 | int numWords, 62 | PHITYPE *phiTopicWordShort); 63 | */ 64 | 65 | /* MultiGPU Reduce Kernels */ 66 | __global__ void LDAUpdatePhiReduceKernelShort( 67 | int k, 68 | int numWords, 69 | PHITYPE *phiTopicWordShort, 70 | PHITYPE *phiTopicWordShortCopy 71 | ); 72 | 73 | __global__ void LDAUpdatePhiReduceKernelInt( 74 | int k, 75 | int numWords, 76 | int *phiTopic, 77 | int *phiTopicCopy 78 | ); 79 | 80 | void LDAUpdatePhiReduceAPI( 81 | int k, 82 | int numWords, 83 | PHITYPE *phiTopicWordShort, 84 | PHITYPE *phiTopicWordShortCopy, 85 | int *phiTopic, 86 | int *phiTopicCopy, 87 | cudaStream_t stream=0 88 | ); 89 | 90 | #endif -------------------------------------------------------------------------------- /src_culda/kernel/lda_theta_kernel.back.cu: -------------------------------------------------------------------------------- 1 | #include "lda_train_kernel.h" 2 | #include "lda_theta_kernel.h" 3 | 4 | 5 | __global__ void LDAUpdateThetaIncreaseKernel( 6 | int k, 7 | int numDocs, 8 | int docIdStart, 9 | int chunkNumDocs, 10 | long long *wordIndices, 11 | int *wordTokens, 12 | short *wordTopics, 13 | long long *docRevIndices, 14 | TokenIdxType *docRevIdx, 15 | short *thetaA, 16 | int 
*thetaCurIA, 17 | int *thetaMaxIA, 18 | short *thetaJA, 19 | int *denseTheta) 20 | { 21 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 22 | int BlockSize = blockDim.x; 23 | 24 | int workerId = tid/BlockSize; 25 | int laneId = tid%BlockSize; 26 | 27 | if(workerId >= chunkNumDocs)return; 28 | 29 | 30 | int docId = workerId + docIdStart; 31 | 32 | 33 | for(long long idx = docRevIndices[docId] + laneId; 34 | idx < docRevIndices[docId + 1]; 35 | idx += BlockSize){ 36 | int topic = wordTopics[docRevIdx[idx]]; 37 | atomicAdd(&(denseTheta[(docId - docIdStart)*k + topic]), 1); 38 | } 39 | 40 | } 41 | 42 | __global__ void LDAUpdateThetaAlignKernel( 43 | int k, 44 | int numDocs, 45 | int docIdStart, 46 | int chunkNumDocs, 47 | long long *wordIndices, 48 | int *wordTokens, 49 | short *wordTopics, 50 | short *thetaA, 51 | int *thetaCurIA, 52 | int *thetaMaxIA, 53 | short *thetaJA, 54 | int *denseTheta) 55 | { 56 | 57 | int laneId = threadIdx.x%32; 58 | int localId = threadIdx.x/32; 59 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 60 | int workerId = tid/32; 61 | 62 | if(workerId >= chunkNumDocs)return; 63 | 64 | //if( workerId <= 1000 || workerId >= 5000)return; 65 | 66 | volatile __shared__ int shaPrefixSum[64]; 67 | 68 | int docId = docIdStart + workerId; 69 | 70 | //if(laneId == 0)printf("docId:%d, IAStart:%d, IACurEnd:%d\n", docId, thetaMaxIA[docId], thetaMaxIA[docId + 1]); 71 | int IAStart = thetaMaxIA[docId]; 72 | //compute 73 | int tmpPrefixSum = 0; 74 | for(int i = laneId;i < k;i += 32){ 75 | 76 | //read 77 | int tmpVal = denseTheta[(docId - docIdStart)*k + i]; 78 | int tmpBin = tmpVal > 0; 79 | shaPrefixSum[localId*32 + laneId] = tmpBin; 80 | 81 | //prefix sum 82 | if(laneId >= 1) 83 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 1] + shaPrefixSum[localId*32 + laneId]; 84 | if(laneId >= 2) 85 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 2] + shaPrefixSum[localId*32 + laneId]; 86 | if(laneId >= 4) 87 | 
shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 4] + shaPrefixSum[localId*32 + laneId]; 88 | if(laneId >= 8) 89 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 8] + shaPrefixSum[localId*32 + laneId]; 90 | if(laneId >= 16) 91 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 16] + shaPrefixSum[localId*32 + laneId]; 92 | 93 | 94 | // 95 | //debug 96 | //if(laneId == 0) 97 | // printf("old tmpPrefixSum:%d\n", tmpPrefixSum); 98 | //printf("laneId:%2d, denseTheta:%d, tmpVal:%d, tmpBin:%d, prefix:%d\n", 99 | // laneId, denseTheta[docId*k + i], tmpVal, tmpBin, shaPrefixSum[laneId]); 100 | 101 | //write 102 | 103 | 104 | int offset = tmpPrefixSum + shaPrefixSum[localId*32 + laneId] - 1; 105 | 106 | 107 | if(tmpVal > 0){ 108 | 109 | //printf("blockid:%5d, threadIdx.x:%4d, IAStart + offset:%lld\n", blockIdx.x, threadIdx.x, IAStart + offset); 110 | thetaA[IAStart + offset] = tmpVal; 111 | thetaJA[IAStart + offset] = i; 112 | } 113 | 114 | tmpPrefixSum += shaPrefixSum[localId*32 + 31]; 115 | 116 | 117 | //debug 118 | //if(laneId == 0) 119 | // printf("new tmpPrefixSum:%d\n", tmpPrefixSum); 120 | //if(laneId == 0) printf("-------------------------------\n"); 121 | 122 | } 123 | 124 | 125 | 126 | if(laneId == 0){ 127 | //printf("docId:%d\n", docId); 128 | thetaCurIA[docId] = IAStart + ((tmpPrefixSum + 31)/32*32); 129 | } 130 | 131 | 132 | 133 | 134 | //print for debug 135 | //if(laneId == 0){ 136 | // for(int i = 0;i < 32;i ++) 137 | // { 138 | // printf("%4d:",i); 139 | // for(int j = 0;j < 32; j++) 140 | // printf("%d ", denseTheta[docId*k + i*32 + j]); 141 | // printf("\n"); 142 | // } 143 | // for(int i = thetaMaxIA[docId]; i < thetaMaxIA[docId + 1];i++){ 144 | // printf("%d,JA(%d), A(%d)\n", i, thetaJA[i], thetaA[i]); 145 | // } 146 | //} 147 | 148 | //break; 149 | 150 | 151 | } 152 | 153 | 154 | 155 | void LDAUpdateThetaAPI( 156 | int k, 157 | int numDocs, 158 | int docIdStart, 159 | int chunkNumDocs, 
160 | long long *wordIndices, 161 | int *wordTokens, 162 | short *wordTopics, 163 | long long *docRevIndices, 164 | TokenIdxType *docRevIdx, 165 | short *thetaA, 166 | int *thetaCurIA, 167 | int *thetaMaxIA, 168 | short *thetaJA, 169 | int *denseTheta, 170 | cudaStream_t stream) 171 | { 172 | 173 | cudaMemsetAsync(denseTheta, 0, sizeof(int)*chunkNumDocs*k, stream); 174 | 175 | LDAUpdateThetaIncreaseKernel<<>>( 176 | k, 177 | numDocs, 178 | docIdStart, 179 | chunkNumDocs, 180 | wordIndices, 181 | wordTokens, 182 | wordTopics, 183 | docRevIndices, 184 | docRevIdx, 185 | thetaA, 186 | thetaCurIA, 187 | thetaMaxIA, 188 | thetaJA, 189 | denseTheta); 190 | 191 | cudaDeviceSynchronize(); 192 | gpuErr(cudaPeekAtLastError()); 193 | 194 | 195 | //printf("chunkNumDocs:%d\n",chunkNumDocs); 196 | 197 | //LDAUpdateThetaAlignKernel<<<(chunkNumDocs+1)/2, 64, 0, stream>>>( 198 | LDAUpdateThetaAlignKernel<<<(chunkNumDocs+1)/2, 64, 0, stream>>>( 199 | k, 200 | numDocs, 201 | docIdStart, 202 | chunkNumDocs, 203 | wordIndices, 204 | wordTokens, 205 | wordTopics, 206 | thetaA, 207 | thetaCurIA, 208 | thetaMaxIA, 209 | thetaJA, 210 | denseTheta 211 | ); 212 | 213 | cudaDeviceSynchronize(); 214 | cudaPeekAtLastError(); 215 | 216 | //sleep(10); 217 | //cudaDeviceSynchronize(); 218 | gpuErr(cudaPeekAtLastError()); 219 | 220 | //exit(0); 221 | } -------------------------------------------------------------------------------- /src_culda/kernel/lda_theta_kernel.back.h: -------------------------------------------------------------------------------- 1 | #ifndef _LDA_THETA_KERNEL_H_ 2 | #define _LDA_THETA_KERNEL_H_ 3 | 4 | __global__ void LDAUpdateThetaIncreaseKernel( 5 | int k, 6 | int numDocs, 7 | int docIdStart, 8 | int chunkNumDocs, 9 | long long *wordIndices, 10 | int *wordTokens, 11 | short *wordTopics, 12 | long long *docRevIndices, 13 | TokenIdxType *docRevIdx, 14 | short *thetaA, 15 | int *thetaCurIA, 16 | int *thetaMaxIA, 17 | short *thetaJA, 18 | int *denseTheta); 19 | 20 | __global__ 
void LDAUpdateThetaAlignKernel( 21 | int k, 22 | int numDocs, 23 | int docIdStart, 24 | int chunkNumDocs, 25 | long long *wordIndices, 26 | int *wordTokens, 27 | short *wordTopics, 28 | short *thetaA, 29 | int *thetaCurIA, 30 | int *thetaMaxIA, 31 | short *thetaJA, 32 | int *denseTheta); 33 | 34 | void LDAUpdateThetaAPI( 35 | int k, 36 | int numDocs, 37 | int docIdStart, 38 | int chunkNumDocs, 39 | long long *wordIndices, 40 | int *wordTokens, 41 | short *wordTopics, 42 | long long *docRevIndices, 43 | TokenIdxType *docRevIdx, 44 | short *thetaA, 45 | int *thetaCurIA, 46 | int *thetaMaxIA, 47 | short *thetaJA, 48 | int *denseTheta, 49 | cudaStream_t stream = 0); 50 | 51 | #endif -------------------------------------------------------------------------------- /src_culda/kernel/lda_theta_kernel.cu: -------------------------------------------------------------------------------- 1 | #include "lda_train_kernel.h" 2 | #include "lda_theta_kernel.h" 3 | 4 | 5 | __global__ void LDAUpdateThetaKernel( 6 | int k, 7 | int numDocs, 8 | int chunkNumDocs, 9 | int docIdStart, 10 | int docIdEnd, 11 | long long *wordIndices, 12 | int *wordTokens, 13 | short *wordTopics, 14 | long long *docRevIndices, 15 | TokenIdxType *docRevIdx, 16 | short *thetaA, 17 | int *thetaCurIA, 18 | int *thetaMaxIA, 19 | short *thetaJA, 20 | int *denseTheta, 21 | int numThetaWorkers 22 | ) 23 | { 24 | volatile __shared__ int shaPrefixSum[64]; 25 | 26 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 27 | int workerId = tid/32; 28 | int laneId = threadIdx.x%32; 29 | int localId = threadIdx.x/32; 30 | 31 | if(workerId >= numThetaWorkers)return; 32 | 33 | for(int iteDocId = docIdStart + workerId; 34 | iteDocId < docIdEnd; 35 | iteDocId += numThetaWorkers){ 36 | 37 | //clean the array 38 | int startDenseIdx = workerId*k; 39 | int endDenseIdx = workerId*k + k; 40 | for(int denseIdx = startDenseIdx + laneId; denseIdx < endDenseIdx; denseIdx += 32) 41 | denseTheta[denseIdx] = 0; 42 | 43 | //generate the dense 
array 44 | for(long long idx = docRevIndices[iteDocId] + laneId; 45 | idx < docRevIndices[iteDocId + 1]; 46 | idx += 32){ 47 | int topic = wordTopics[docRevIdx[idx]]; 48 | atomicAdd(&(denseTheta[startDenseIdx + topic]), 1); 49 | } 50 | 51 | //generate the sparse array 52 | int IAStart = thetaMaxIA[iteDocId]; 53 | int tmpPrefixSum = 0; 54 | 55 | for(int i = laneId; i < k;i += 32){ 56 | 57 | //read 58 | int tmpVal = denseTheta[startDenseIdx + i]; 59 | int tmpBin = tmpVal > 0; 60 | shaPrefixSum[localId*32 + laneId] = tmpBin; 61 | 62 | //prefixsum 63 | if(laneId >= 1) 64 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 1] + shaPrefixSum[localId*32 + laneId]; 65 | if(laneId >= 2) 66 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 2] + shaPrefixSum[localId*32 + laneId]; 67 | if(laneId >= 4) 68 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 4] + shaPrefixSum[localId*32 + laneId]; 69 | if(laneId >= 8) 70 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 8] + shaPrefixSum[localId*32 + laneId]; 71 | if(laneId >= 16) 72 | shaPrefixSum[localId*32 + laneId] = shaPrefixSum[localId*32 + laneId - 16] + shaPrefixSum[localId*32 + laneId]; 73 | 74 | 75 | int offset = tmpPrefixSum + shaPrefixSum[localId*32 + laneId] - 1; 76 | if(tmpVal > 0){ 77 | thetaA[IAStart + offset] = tmpVal; 78 | thetaJA[IAStart + offset] = i; 79 | } 80 | tmpPrefixSum += shaPrefixSum[localId*32 + 31]; 81 | } 82 | 83 | if(laneId == 0) 84 | thetaCurIA[iteDocId] = IAStart + ((tmpPrefixSum + 31)/32*32); 85 | } 86 | 87 | } 88 | 89 | void LDAUpdateThetaAPI( 90 | int k, 91 | int numDocs, 92 | int chunkNumDocs, 93 | int docIdStart, 94 | int docIdEnd, 95 | long long *wordIndices, 96 | int *wordTokens, 97 | short *wordTopics, 98 | long long *docRevIndices, 99 | TokenIdxType *docRevIdx, 100 | short *thetaA, 101 | int *thetaCurIA, 102 | int *thetaMaxIA, 103 | short *thetaJA, 104 | int *denseTheta, 105 | cudaStream_t stream 106 
| ) 107 | { 108 | LDAUpdateThetaKernel<<<(UpdateNumWorkers+1)/2,64,0,stream>>>( 109 | k, 110 | numDocs, 111 | chunkNumDocs, 112 | docIdStart, 113 | docIdEnd, 114 | wordIndices, 115 | wordTokens, 116 | wordTopics, 117 | docRevIndices, 118 | docRevIdx, 119 | thetaA, 120 | thetaCurIA, 121 | thetaMaxIA, 122 | thetaJA, 123 | denseTheta, 124 | UpdateNumWorkers 125 | ); 126 | } -------------------------------------------------------------------------------- /src_culda/kernel/lda_theta_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _LDA_THETA_KERNEL_H_ 2 | #define _LDA_THETA_KERNEL_H_ 3 | 4 | #include "../model/culda_argument.h" 5 | 6 | __global__ void LDAUpdateThetaKernel( 7 | int k, 8 | int numDocs, 9 | int chunkNumDocs, 10 | int docIdStart, 11 | int docIdEnd, 12 | long long *wordIndices, 13 | int *wordTokens, 14 | short *wordTopics, 15 | long long *docRevIndices, 16 | TokenIdxType *docRevIdx, 17 | short *thetaA, 18 | int *thetaCurIA, 19 | int *thetaMaxIA, 20 | short *thetaJA, 21 | int *denseTheta, 22 | int numThetaWorkers 23 | ); 24 | 25 | void LDAUpdateThetaAPI( 26 | int k, 27 | int numDocs, 28 | int chunkNumDocs, 29 | int docIdStart, 30 | int docIdEnd, 31 | long long *wordIndices, 32 | int *wordTokens, 33 | short *wordTopics, 34 | long long *docRevIndices, 35 | TokenIdxType *docRevIdx, 36 | short *thetaA, 37 | int *thetaCurIA, 38 | int *thetaMaxIA, 39 | short *thetaJA, 40 | int *denseTheta, 41 | cudaStream_t stream = 0 42 | ); 43 | 44 | 45 | #endif -------------------------------------------------------------------------------- /src_culda/kernel/lda_train_kernel.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "lda_train_kernel.h" 10 | #include "../model/culda_argument.h" 11 | 12 | using namespace std; 13 | 14 | __global__ void initRandState(curandState *state) 15 | { 16 | int tid = 
blockIdx.x*blockDim.x + threadIdx.x; 17 | curand_init(clock() + tid, tid, 0,&state[tid]); 18 | } 19 | 20 | __global__ void LDAKernelTrain( 21 | int k, //parameters 22 | float alpha, 23 | float beta, 24 | int numDocs, // corpora 25 | int numWords, 26 | long long numTokens, 27 | long long *wordIndices, // data, numWords + 1 28 | int *slotIdToWordId, // data, numSlots 29 | long long *slotIndices, // data, numSlots*2 30 | int *wordTokens, // data, numTokens 31 | short *wordTopics, // data, numTokens 32 | short *thetaA, //model, values, thetaNNZ 33 | int *thetaMaxIA, //model, offsets, numDocs + 1, 34 | int *thetaCurIA, //model, offsets, numDocs, 35 | short *thetaJA, //model, column indices, thetaNNZ 36 | int docIdStart, 37 | PHITYPE *phiTopicWord, //model, numWords*k 38 | int *phiTopic, //model, k 39 | half *phiHead, //model, numWords*k 40 | curandState *randState, 41 | int randStateSize, 42 | int GPUid, 43 | double *wordPerplexity, //numWords 44 | long long *docRevIndices) 45 | { 46 | 47 | 48 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 49 | int workerId = tid/TrainBlockSize; 50 | int laneId = threadIdx.x%32; 51 | int localId = threadIdx.x/32; 52 | 53 | //samling index 54 | volatile __shared__ float prefixSumQTree[32]; 55 | volatile __shared__ float prefixSumSTree[TrainBlockSize/32][32]; 56 | volatile __shared__ float prefixSumSample[TrainBlockSize/32][32]; 57 | 58 | //cache to store phi. 
59 | volatile __shared__ float phiHeadCache[1024 + ShaMemPad + 0]; 60 | 61 | int wordId = slotIdToWordId[workerId]; 62 | long long tokenStart = __ldg(&slotIndices[workerId*2]); 63 | long long tokenEnd = __ldg(&slotIndices[workerId*2 + 1]); 64 | 65 | //load phi head into cache 66 | int tmpEnd = k/32; 67 | 68 | for(int QIdx = localId; QIdx < tmpEnd; QIdx += TrainBlockSize/32){ 69 | 70 | int tmpK = QIdx*32 + laneId; 71 | float tmpVal = __half2float(phiHead[k*wordId + tmpK]); 72 | phiHeadCache[tmpK] = tmpVal; 73 | 74 | tmpVal = alpha*tmpVal; 75 | tmpVal += __shfl_down(tmpVal, 16); 76 | tmpVal += __shfl_down(tmpVal, 8); 77 | tmpVal += __shfl_down(tmpVal, 4); 78 | tmpVal += __shfl_down(tmpVal, 2); 79 | tmpVal += __shfl_down(tmpVal, 1); 80 | tmpVal = __shfl(tmpVal, 0); 81 | prefixSumQTree[QIdx] = tmpVal; 82 | } 83 | __syncthreads(); 84 | 85 | //accumulation prefixSumQTree 86 | if(localId == 0){ 87 | if(laneId >= 1) 88 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 1] + prefixSumQTree[laneId]; 89 | if(laneId >= 2) 90 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 2] + prefixSumQTree[laneId]; 91 | if(laneId >= 4) 92 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 4] + prefixSumQTree[laneId]; 93 | if(laneId >= 8) 94 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 8] + prefixSumQTree[laneId]; 95 | if(laneId >= 16) 96 | prefixSumQTree[laneId] = prefixSumQTree[laneId - 16] + prefixSumQTree[laneId]; 97 | } 98 | __syncthreads(); 99 | float Q = prefixSumQTree[31]; 100 | 101 | float sumPerplexity = 0.0; 102 | 103 | //int stateId = (workerId*TrainBlockSize/32 + localId)%randStateSize; 104 | for(int tokenIdx = tokenStart + localId; 105 | tokenIdx < tokenEnd; 106 | tokenIdx += TrainBlockSize/32) //iterate over tokens 107 | { 108 | int docId = __ldg(&wordTokens[tokenIdx]); 109 | 110 | //computing S. 
111 | float S = 0; 112 | int IAStart = __ldg(&thetaMaxIA[docId]); //L1 cache 113 | int IACurEnd = __ldg(&thetaCurIA[docId]); //L1 cache 114 | prefixSumSTree[localId][laneId] = 0; 115 | 116 | for(int tmpIdx = IAStart + laneId, SIdx = 0; 117 | tmpIdx < IACurEnd; 118 | tmpIdx += 32){ 119 | 120 | int colVal = __ldg(&thetaA[tmpIdx]); //L1 cache 121 | int colK = __ldg(&thetaJA[tmpIdx]); //L1 cache 122 | //int colVal = thetaA[tmpIdx]; 123 | //int colK = thetaJA[tmpIdx]; 124 | float tmpP1k = colVal*phiHeadCache[colK]; 125 | //go reduce. 126 | tmpP1k += __shfl_down(tmpP1k, 16); 127 | tmpP1k += __shfl_down(tmpP1k, 8); 128 | tmpP1k += __shfl_down(tmpP1k, 4); 129 | tmpP1k += __shfl_down(tmpP1k, 2); 130 | tmpP1k += __shfl_down(tmpP1k, 1); 131 | tmpP1k = __shfl(tmpP1k, 0); 132 | 133 | S += tmpP1k; 134 | prefixSumSTree[localId][SIdx] = S; 135 | SIdx ++; 136 | } 137 | S = __shfl(S,0); 138 | 139 | //randomly generate u. 140 | float u; 141 | if(laneId == 0)u = curand_uniform(&(randState[workerId%randStateSize])); 142 | u = __shfl(u, 0); 143 | int newZ = 0; 144 | 145 | if(u < S/(S+Q)) 146 | { 147 | 148 | //totalS ++; 149 | //tmpClock = clock64(); 150 | 151 | float transU = u*(S+Q); 152 | 153 | float tmpSumHigh, tmpSumLow = 0.0; 154 | tmpSumHigh = prefixSumSTree[localId][laneId]; 155 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32); 156 | if(laneId == 0)tmpSumLow = 0; 157 | 158 | int voteFlag = 0; 159 | if(transU < tmpSumHigh) voteFlag = 1; 160 | int lvl1Idx = __ffs(__ballot(voteFlag)) - 1; 161 | 162 | int overflowFlag = 0; 163 | 164 | if(lvl1Idx < 0) lvl1Idx = (IACurEnd - IAStart)/32 - 1; 165 | 166 | //float originalU = transU; 167 | transU = transU - tmpSumLow; 168 | transU = __shfl(transU, lvl1Idx); 169 | 170 | int tmpIdx = IAStart + lvl1Idx*32 + laneId; 171 | int tmpNewZ = __ldg(&thetaJA[tmpIdx]); 172 | int colVal = __ldg(&thetaA[tmpIdx]); 173 | float p1k = colVal*phiHeadCache[tmpNewZ]; 174 | 175 | prefixSumSample[localId][laneId] = p1k; 176 | 177 | if(laneId >= 1) 
prefixSumSample[localId][laneId] = 178 | prefixSumSample[localId][laneId - 1] + prefixSumSample[localId][laneId]; 179 | if(laneId >= 2) prefixSumSample[localId][laneId] = 180 | prefixSumSample[localId][laneId - 2] + prefixSumSample[localId][laneId]; 181 | if(laneId >= 4) prefixSumSample[localId][laneId] = 182 | prefixSumSample[localId][laneId - 4] + prefixSumSample[localId][laneId]; 183 | if(laneId >= 8) prefixSumSample[localId][laneId] = 184 | prefixSumSample[localId][laneId - 8] + prefixSumSample[localId][laneId]; 185 | if(laneId >= 16)prefixSumSample[localId][laneId] = 186 | prefixSumSample[localId][laneId - 16] + prefixSumSample[localId][laneId]; 187 | 188 | float tmpSum = prefixSumSample[localId][laneId]; 189 | 190 | voteFlag = 0; 191 | if(transU < tmpSum) voteFlag = 1; 192 | int offset = __ffs(__ballot(voteFlag)) - 1; 193 | 194 | //int offset1 = offset; 195 | //offset = 31 - __clz(__ballot(colVal>0)); 196 | newZ = __shfl(tmpNewZ, offset); 197 | 198 | } 199 | else //bucket Q 200 | { 201 | 202 | float transU = (u - S/(S+Q))*(S+Q); 203 | //totalQ ++; 204 | //float originalU = transU; 205 | 206 | //level 1: decide position 207 | float tmpSumHigh, tmpSumLow = 0.0; 208 | tmpSumHigh = prefixSumQTree[laneId]; 209 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32); 210 | if(laneId == 0)tmpSumLow = 0; 211 | 212 | //voting for lvl1Idx 213 | int voteFlag = 0; 214 | if(transU < tmpSumHigh) voteFlag = 1; //voteFlag = transU < tmpSumHigh; 215 | int lvl1Idx = __ffs(__ballot(voteFlag)) - 1; 216 | if(lvl1Idx < 0) lvl1Idx = 31; 217 | 218 | 219 | 220 | float originalU = transU; 221 | transU = transU - tmpSumLow; 222 | transU = __shfl(transU, lvl1Idx); 223 | 224 | prefixSumSample[localId][laneId] = alpha*phiHeadCache[32*lvl1Idx + laneId]; 225 | 226 | // accumulation 227 | if(laneId >= 1) prefixSumSample[localId][laneId] = 228 | prefixSumSample[localId][laneId - 1] + prefixSumSample[localId][laneId]; 229 | if(laneId >= 2) prefixSumSample[localId][laneId] = 230 | 
prefixSumSample[localId][laneId - 2] + prefixSumSample[localId][laneId]; 231 | if(laneId >= 4) prefixSumSample[localId][laneId] = 232 | prefixSumSample[localId][laneId - 4] + prefixSumSample[localId][laneId]; 233 | if(laneId >= 8) prefixSumSample[localId][laneId] = 234 | prefixSumSample[localId][laneId - 8] + prefixSumSample[localId][laneId]; 235 | if(laneId >= 16)prefixSumSample[localId][laneId] = 236 | prefixSumSample[localId][laneId - 16] + prefixSumSample[localId][laneId]; 237 | 238 | voteFlag = 0; 239 | tmpSumLow = 0; 240 | tmpSumHigh = prefixSumSample[localId][laneId]; 241 | tmpSumLow = __shfl_up(tmpSumHigh, 1, 32); 242 | 243 | if(laneId == 0)tmpSumLow = 0; 244 | 245 | if( transU < tmpSumHigh)voteFlag = 1; //voteFlag = transU < tmpSumHigh; 246 | int lvl2Idx = __ffs(__ballot(voteFlag)) - 1; 247 | 248 | if(lvl2Idx < 0)lvl2Idx = 31; 249 | 250 | newZ = lvl1Idx*32 + lvl2Idx; 251 | 252 | //if(tmpFlag == 1)return; 253 | } 254 | 255 | //update & get perplexity 256 | if(laneId == 0){ 257 | wordTopics[tokenIdx] = newZ; 258 | sumPerplexity += log((S+Q)/(docRevIndices[docId + 1] - docRevIndices[docId] + k*alpha)); 259 | } 260 | } 261 | if(threadIdx.x%32 == 0) 262 | wordPerplexity[(threadIdx.x+blockDim.x*blockIdx.x)/32] = sumPerplexity; 263 | 264 | /* 265 | float totalTime = (clock64() - startClock)/1000000000.0; 266 | if(GPUid == 0 && laneId == 0 && localId == 0 && workerId < -1){ 267 | //printf("worker id:%5d, time: %.2fB S1time: %.2fB S2time: %.2fB Qtime: %.2fB, other: %.2fB, innerLoopTime: %.2fB\n", workerId, totalTime, S1Time, S2Time, QTime, otherTime, innerLoopTime); 268 | printf("worker id:%5d, time: %.2fB\n", workerId, totalTime); 269 | } 270 | */ 271 | } 272 | 273 | __global__ void LDATrainPerplexityReduce1(double *perplexity, double *perplexityMid, int numVals){ 274 | 275 | 276 | int numWarps = gridDim.x*blockDim.x/32; 277 | int tid = threadIdx.x + blockIdx.x*blockDim.x; 278 | int warpId = tid/32; 279 | int laneId = tid%32; 280 | 281 | 282 | int perWarpSize = 
((numVals + numWarps - 1)/numWarps + 31)/32*32; 283 | int startIdx = perWarpSize*warpId + laneId; 284 | int endIdx = perWarpSize*warpId + perWarpSize; 285 | 286 | double totalProd = 0; 287 | for(long long i = startIdx;i < endIdx; i += 32){ 288 | 289 | int tmpProd = 0; 290 | if(i < numVals)tmpProd = perplexity[i]; 291 | 292 | tmpProd += __shfl_down(tmpProd, 16); 293 | tmpProd += __shfl_down(tmpProd, 8); 294 | tmpProd += __shfl_down(tmpProd, 4); 295 | tmpProd += __shfl_down(tmpProd, 2); 296 | tmpProd += __shfl_down(tmpProd, 1); 297 | 298 | totalProd += tmpProd; 299 | } 300 | 301 | if(laneId == 0) perplexityMid[warpId] = totalProd; 302 | } 303 | 304 | __global__ void LDATrainPerplexityReduce2(double *perplexityMid) 305 | { 306 | 307 | double sum = 0; 308 | for(int i = threadIdx.x; i < ReduceParameter; i += 32){ 309 | double tmpProd = perplexityMid[i]; 310 | 311 | tmpProd += __shfl_down(tmpProd, 16); 312 | tmpProd += __shfl_down(tmpProd, 8); 313 | tmpProd += __shfl_down(tmpProd, 4); 314 | tmpProd += __shfl_down(tmpProd, 2); 315 | tmpProd += __shfl_down(tmpProd, 1); 316 | 317 | sum += tmpProd; 318 | } 319 | 320 | if(threadIdx.x == 0)perplexityMid[0] = sum; 321 | } 322 | 323 | double LDATrainPerplexity(Document &doc, cudaStream_t *streams) 324 | { 325 | 326 | double tmpSum[MaxNumGPU]; 327 | double sum = 0; 328 | if(streams == NULL){ 329 | 330 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){ 331 | 332 | cudaSetDevice(chunkId); 333 | 334 | //reduce 1. 335 | LDATrainPerplexityReduce1<<>>( 336 | doc.docChunkVec[chunkId]->deviceWordPerplexity, 337 | doc.docChunkVec[chunkId]->deviceWordPerplexityMid, 338 | doc.docChunkVec[chunkId]->numWords*(TrainBlockSize/32)); 339 | 340 | double testMid[ReduceParameter]; 341 | cudaMemcpy(testMid, doc.docChunkVec[chunkId]->deviceWordPerplexityMid, sizeof(double)*ReduceParameter, cudaMemcpyDeviceToHost); 342 | 343 | //cudaDeviceSynchronize(); 344 | //gpuErr(cudaPeekAtLastError()); 345 | 346 | //reduce 2. 
347 | LDATrainPerplexityReduce2<<<1,32,0>>>(doc.docChunkVec[chunkId]->deviceWordPerplexityMid); // second-stage reduction: chunk total ends up in perplexityMid[0]
348 | 
349 | cudaMemcpy(tmpSum, doc.docChunkVec[chunkId]->deviceWordPerplexityMid, sizeof(double), cudaMemcpyDeviceToHost); // synchronous copy of perplexityMid[0] only
350 | 
351 | 
352 | sum += tmpSum[0];
353 | //printf("loglike:%.4f e10\n", sum);
354 | }
355 | return (sum/doc.numTokens);
356 | }
357 | else
358 | {
359 | 
360 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
361 | {
362 | cudaSetDevice(chunkId);
363 | //reduce 1.
364 | LDATrainPerplexityReduce1<<>>( // NOTE(review): launch config was lost in extraction ("<<>>"); also no stream argument is visible here, unlike the Reduce2 launch below -- confirm against the original source
365 | doc.docChunkVec[chunkId]->deviceWordPerplexity,
366 | doc.docChunkVec[chunkId]->deviceWordPerplexityMid,
367 | doc.docChunkVec[chunkId]->numWords); // NOTE(review): the streams==NULL path passes numWords*(TrainBlockSize/32), and allocGPU sizes deviceWordPerplexity that way -- this count looks too small; verify
368 | 
369 | //cudaDeviceSynchronize();
370 | //gpuErr(cudaPeekAtLastError());
371 | 
372 | //reduce 2.
373 | LDATrainPerplexityReduce2<<<1,32,0, streams[chunkId]>>>(doc.docChunkVec[chunkId]->deviceWordPerplexityMid);
374 | cudaMemcpyAsync(tmpSum + chunkId, doc.docChunkVec[chunkId]->deviceWordPerplexityMid, sizeof(double), cudaMemcpyDeviceToHost, streams[chunkId]); // one partial sum per chunk, gathered after the sync loop below
375 | }
376 | 
377 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
378 | cudaStreamSynchronize(streams[chunkId]);
379 | 
380 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++)
381 | sum += tmpSum[chunkId];
382 | 
383 | return sum/doc.numTokens;
384 | }
385 | //return exp(-1*sum/doc.numTokens);
386 | }
387 | 
388 | 
389 | 
390 | 
391 | 
392 | 
393 | 
394 | 
395 | 
396 | 
397 | 
398 | 
399 | 
400 | 
401 | 
402 | 
--------------------------------------------------------------------------------
/src_culda/kernel/lda_train_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _LDA_TRAIN_KERNEL_H_
2 | #define _LDA_TRAIN_KERNEL_H_
3 | 
4 | 
5 | #include 
6 | #include 
7 | #include 
8 | #include 
9 | 
10 | 
11 | #include "../model/culda_argument.h"
12 | #include "../model/doc.h"
13 | 
14 | 
15 | 
16 | __global__ void initRandState(curandState *state);
17 | 
18 | 
19 | __global__ void LDAKernelTrain(
20 | 
int k, //parameters 21 | float alpha, 22 | float beta, 23 | int numDocs, // corpora 24 | int numWords, 25 | long long numTokens, 26 | long long *wordIndices, // data, numWords + 1 27 | int *slotIdToWordId, // data, numSlots 28 | long long *slotIndices, // data, numSlots*2 29 | int *wordTokens, // data, numTokens 30 | short *wordTopics, // data, numTokens 31 | short *thetaA, //model, values, thetaNNZ 32 | int *thetaMaxIA, //model, offsets, numDocs + 1, 33 | int *thetaCurIA, //model, offsets, numDocs, 34 | short *thetaJA, //model, column indices, thetaNNZ 35 | int docIdStart, 36 | PHITYPE *phiTopicWord, //model, numWords*k 37 | int *phiTopic, //model, k 38 | half *phiHead, //model, numWords*k 39 | curandState *randState, 40 | int randStateSize, 41 | int GPUid, 42 | double *wordPerplexity, 43 | long long *docRevIndices); 44 | 45 | double LDATrainPerplexity(Document &, cudaStream_t *streams = NULL); 46 | 47 | #endif -------------------------------------------------------------------------------- /src_culda/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | 17 | #include "./model/model_theta.h" 18 | #include "./model/culda_argument.h" 19 | #include "./model/vocab.h" 20 | #include "./model/doc.h" 21 | 22 | #include "./train/lda_train.h" 23 | 24 | 25 | using namespace std; 26 | 27 | 28 | bool ISNumerical(char *str) 29 | { 30 | int c = 0; 31 | while(*str != '\0') 32 | { 33 | if(isdigit(*str))c++; 34 | else return false; 35 | str++; 36 | } 37 | return c > 0; 38 | } 39 | 40 | Argument ParseArgument(int argc, char **argv) 41 | { 42 | vector args; 43 | for(int i = 0;i < argc; i++){ 44 | args.push_back(string(argv[i])); 45 | } 46 | 47 | if(argc == 1) 48 | throw invalid_argument("No argument found"); 49 | 50 | Argument argument; 51 | int i; 52 | 53 | for(i = 1;i < 
argc; i++){ 54 | 55 | if(args[i].compare("-g") == 0){ 56 | 57 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -g"); 58 | if(!ISNumerical(argv[i+1])) 59 | throw invalid_argument("-k should be followed by a positive integer"); 60 | argument.numGPUs = atoi(argv[i+1]); 61 | i++; 62 | } 63 | else if(args[i].compare("-k") == 0){ 64 | 65 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -k"); 66 | if(!ISNumerical(argv[i+1])) 67 | throw invalid_argument("-k should be followed by a positive integer"); 68 | argument.k = atoi(argv[i+1]); 69 | 70 | //TBD: check k 71 | i++; 72 | } 73 | else if(args[i].compare("-t") == 0){ 74 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -t"); 75 | if(!ISNumerical(argv[i+1])) 76 | throw invalid_argument("-t should be followed by a positive integer"); 77 | argument.iteration = atoi(argv[i+1]); 78 | 79 | //TBD: check t 80 | i++; 81 | } 82 | else if(args[i].compare("-s") == 0){ 83 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -s"); 84 | if(!ISNumerical(argv[i+1])) 85 | throw invalid_argument("-s should be followed by a positive integer"); 86 | argument.numWorkers = atoi(argv[i+1]); 87 | 88 | i++; 89 | } 90 | else if(args[i].compare("-a") == 0){ 91 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -a"); 92 | 93 | argument.alpha = atof(argv[i+1]); 94 | 95 | //TBD: check it 96 | i++; 97 | } 98 | else if(args[i].compare("-b") == 0){ 99 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -b"); 100 | //if(!ISNumerical(argv[i+1])) 101 | // throw invalid_argument("-b should be followed by a number"); 102 | argument.beta = atof(argv[i+1]); 103 | 104 | //TBD: check it 105 | i++; 106 | } 107 | else if(args[i].compare("-c") == 0){ 108 | if((i + 1) >= argc)throw invalid_argument("need to specify a value after -b"); 109 | //if(!ISNumerical(argv[i+1])) 110 | // throw invalid_argument("-b should be followed 
by a number"); 111 | argument.numChunks = atoi(argv[i+1]); 112 | 113 | //TBD: check it 114 | i++; 115 | } 116 | else if(args[i].compare("-i") == 0){ 117 | if((i + 1) >= argc)throw invalid_argument("need to specify a file name prefix after -i"); 118 | argument.inputFilePrefix = args[i+1]; 119 | i++; 120 | } 121 | else if(args[i].compare("-o") == 0){ 122 | if((i + 1) >= argc)throw invalid_argument("need to specify a file name prefix after -o"); 123 | argument.outputFilePrefix = args[i+1]; 124 | i++; 125 | } 126 | else break; 127 | 128 | } 129 | 130 | 131 | //process k 132 | if(argument.k%32 != 0){ 133 | printf("Warning: number of topics(k) has been rounded to multiples of 32.\n"); 134 | argument.k = (argument.k + 31)/32*32; 135 | } 136 | 137 | 138 | if (argument.numWorkers <= 0){ 139 | printf("Warning: wrong number of workers.\n"); 140 | argument.numWorkers = 1; 141 | } 142 | 143 | //process output file names. 144 | argument.outputWordFileName = argument.outputFilePrefix + ".word.full.txt"; 145 | argument.outputDocFileName = argument.outputFilePrefix + ".doc.full.txt"; 146 | 147 | //GPU number 148 | int deviceCount = 1; 149 | cudaError_t error_id = cudaGetDeviceCount(&deviceCount); 150 | 151 | if(deviceCount >= MaxNumGPU)deviceCount = MaxNumGPU; 152 | 153 | if(argument.numGPUs <= 0)argument.numGPUs = 1; 154 | if(argument.numGPUs > deviceCount){ 155 | printf("Warning: number of GPUs(%d) is larger than device count(%d), rounded to %d\n", argument.numGPUs,deviceCount,deviceCount); 156 | argument.numGPUs = deviceCount; 157 | } 158 | 159 | 160 | return argument; 161 | } 162 | 163 | 164 | int main(int argc, char**argv) 165 | { 166 | clock_t clockStart; 167 | Argument argument; 168 | 169 | printf("Parsing arguments ...\n"); 170 | try{ 171 | argument = ParseArgument(argc, argv); 172 | } 173 | catch(invalid_argument &e){ 174 | cout << "Error: " < 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include 13 | 14 | 15 | const int SCacheSize = 64; 16 | 
17 | const int UpdateNumWorkers = 28*16; 18 | /*/ 19 | Optimal on Titan X: 24*16 20 | Optimal on P100 : 56*16 21 | Optimal on V100 : 80*16 22 | */ 23 | 24 | 25 | const int TrainBlockSize = 1024; 26 | const int NumConWorkers = 28*2; 27 | const int ShaMemPad = 0; 28 | 29 | 30 | const int MaxNumGPU = 32; 31 | const int ReduceParameter = 1024; 32 | 33 | //typedef unsigned short PHITYPE; 34 | typedef int PHITYPE; 35 | 36 | typedef int TokenIdxType; 37 | 38 | using namespace std; 39 | 40 | #define gpuErr(ans) { gpuAssert((ans), __FILE__, __LINE__); } 41 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true) 42 | { 43 | if (code != cudaSuccess) 44 | { 45 | fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 46 | if (abort) exit(code); 47 | } 48 | } 49 | 50 | class Argument 51 | { 52 | public: 53 | int numGPUs; 54 | int k; 55 | int iteration; 56 | int numWorkers; 57 | int numChunks; 58 | std::string inputFilePrefix; 59 | 60 | std::string outputFilePrefix; 61 | 62 | std::string outputWordFileName; 63 | std::string outputDocFileName; 64 | 65 | float alpha; 66 | float beta; 67 | 68 | 69 | void printArgument(){ 70 | 71 | printf("numGPUs :%d\n", numGPUs); 72 | printf("k :%d\n", k); 73 | printf("iteration :%d\n", iteration); 74 | printf("numWorkers :%d\n", numWorkers); 75 | printf("numChunks :%d\n", numChunks); 76 | printf("alpha :%.2f\n", alpha); 77 | printf("beta :%.2f\n", beta); 78 | printf("prefix :%s\n", inputFilePrefix.c_str()); 79 | printf("outfile :%s\n", outputFilePrefix.c_str()); 80 | printf("\n"); 81 | } 82 | }; 83 | 84 | #endif -------------------------------------------------------------------------------- /src_culda/model/doc.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | 13 | #include "doc.h" 14 | 15 | using namespace std; 16 | //Definitions of 
Class Document's methods. 17 | 18 | 19 | Document::Document(): 20 | numDocs(0), 21 | numWords(0), 22 | numTokens(0), 23 | docIndices(NULL), 24 | docLength(NULL) 25 | { 26 | } 27 | 28 | Document::Document(const string &filePrefix, int argNumChunks): 29 | numDocs(0), 30 | numWords(0), 31 | numTokens(0), 32 | docIndices(NULL), 33 | docLength(NULL) 34 | { 35 | numChunks = argNumChunks; 36 | loadDocument(filePrefix, argNumChunks); 37 | } 38 | 39 | Document::Document(const Document &doc): 40 | numDocs(0), 41 | numWords(0), 42 | numTokens(0), 43 | docIndices(NULL), 44 | docLength(NULL) 45 | { 46 | 47 | //TBD!!! 48 | /* 49 | numWords = doc.numWords; 50 | numTokens = doc.numTokens; 51 | ids = doc.ids; 52 | 53 | tokens = new int[numTokens]; 54 | indices = new long long[numWords*2]; 55 | 56 | copy(doc.tokens, doc.tokens + numTokens, tokens); 57 | copy(doc.indices, doc.indices + numWords*2, indices); 58 | */ 59 | } 60 | 61 | 62 | void Document::loadDocument(const string &filePrefix, int argNumChunks) 63 | { 64 | numChunks = argNumChunks; 65 | 66 | /* Load docIndices and docLength*/ 67 | string docIdxFileName = filePrefix + ".doc.idx"; 68 | ifstream docIdxStream(docIdxFileName.c_str(), ios::in); 69 | if(!docIdxStream.is_open()){ 70 | cout << "File " << docIdxFileName << " open failed" << endl; 71 | exit(0); 72 | } 73 | 74 | numDocs = 0; 75 | int docId; 76 | long long endIdx; 77 | vector docIndicesVec; 78 | docIndicesVec.push_back(0); 79 | while(docIdxStream >> docId >> endIdx) 80 | { 81 | numDocs ++; 82 | docIndicesVec.push_back(endIdx); 83 | } 84 | docIdxStream.close(); 85 | 86 | docIndices = new long long[numDocs + 1]; 87 | docLength = new int[numDocs]; 88 | 89 | for(int i = 0;i < numDocs; i++) 90 | docLength[i] = docIndicesVec[i + 1] - docIndicesVec[i]; 91 | 92 | for(int i = 0;i < numDocs + 1; i++) 93 | docIndices[i] = docIndicesVec[i]; 94 | 95 | docIndicesVec.clear(); 96 | 97 | //load .chunk meta data. 
98 | vector chunkDocVec; 99 | ifstream chunkFileStream((filePrefix + ".chunk").c_str(), ios::in); 100 | if(!chunkFileStream.is_open()){ 101 | cout << "File " << filePrefix << ".chunk open failed" << endl; 102 | exit(0); 103 | } 104 | chunkDocVec.push_back(0); 105 | int tmp1, tmp2; 106 | while(chunkFileStream >> tmp1 >> tmp2){ 107 | chunkDocVec.push_back(tmp2); 108 | } 109 | chunkFileStream.close(); 110 | 111 | if(chunkDocVec.size() != (numChunks + 1)){ 112 | printf("Error: numChunks(%d) does not match the chunk file\n",numChunks); 113 | exit(0); 114 | } 115 | 116 | //load data 117 | for(int chunkId = 0;chunkId < numChunks; chunkId ++){ 118 | docChunkVec.push_back( 119 | new DocChunk( 120 | chunkId, 121 | chunkDocVec[chunkId], 122 | chunkDocVec[chunkId + 1], 123 | numDocs, 124 | numChunks)); 125 | } 126 | 127 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){ 128 | 129 | stringstream tmpStream; 130 | string wordIdxFileName, wordDataFileName; 131 | if(numChunks == 1){ 132 | tmpStream << filePrefix << ".word.idx"; 133 | tmpStream >> wordIdxFileName; 134 | tmpStream.clear(); 135 | tmpStream << filePrefix << ".word.data"; 136 | tmpStream >> wordDataFileName; 137 | } 138 | else{ 139 | tmpStream << filePrefix << ".word.idx" << chunkId; 140 | tmpStream >> wordIdxFileName; 141 | tmpStream.clear(); 142 | tmpStream << filePrefix << ".word.data" << chunkId; 143 | tmpStream >> wordDataFileName; 144 | } 145 | docChunkVec[chunkId]->loadChunk(wordIdxFileName, wordDataFileName, docLength); 146 | 147 | printf(" chunk %d loaded ...\n", chunkId); 148 | } 149 | numWords = docChunkVec[0]->numWords; 150 | printf("\n"); 151 | 152 | numTokens = 0; 153 | for(int chunkId = 0; chunkId < numChunks; chunkId ++) 154 | numTokens += docChunkVec[chunkId]->chunkNumTokens; 155 | 156 | } 157 | 158 | void Document::generateTopics(int k) //TBD: parallelization 159 | { 160 | 161 | printf("Initialize the topic for tokens ...\n\n"); 162 | for(int chunkId = 0; chunkId < numChunks; chunkId ++) 163 | 
docChunkVec[chunkId]->generateTopics(k); 164 | } 165 | 166 | void Document::clear() 167 | { 168 | numDocs = 0; 169 | numWords = 0; 170 | numTokens = 0; 171 | 172 | for(int chunkId = 0; chunkId < numChunks;chunkId ++) 173 | delete docChunkVec[chunkId]; 174 | 175 | if(docIndices != NULL) delete []docIndices; 176 | if(docLength != NULL) delete []docLength; 177 | 178 | docIndices = NULL; 179 | docLength = NULL; 180 | } 181 | 182 | void Document::printDocumentAbbr() 183 | { 184 | printf("numDocs : %d\n",numDocs); 185 | printf("numWords : %d\n",numWords); 186 | printf("numTokens: %lld\n",numTokens); 187 | 188 | printf("\n"); 189 | } 190 | 191 | 192 | void Document::printDocumentFull() 193 | { 194 | printf("----doc info-----\n"); 195 | printf("numDocs : %d\n",numDocs); 196 | printf("numWords : %d\n",numWords); 197 | printf("numTokens: %lld\n",numTokens); 198 | 199 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){ 200 | printf("**chunkId:%d\n", chunkId); 201 | printf("word range:\n"); 202 | for(long long i = 0;i < numWords; i++) 203 | printf("word%2d %6lld - %lld\n",i, 204 | docChunkVec[chunkId]->wordIndices[i], 205 | docChunkVec[chunkId]->wordIndices[i+1]); 206 | 207 | printf("token list:\n"); 208 | printf("tokenId, wordId, DocId, topics\n"); 209 | for(long long wordId = 0;wordId < numWords; wordId++){ 210 | 211 | long long start = docChunkVec[chunkId]->wordIndices[wordId]; 212 | long long end = docChunkVec[chunkId]->wordIndices[wordId + 1]; 213 | for(long long tokenId = start; tokenId < end; tokenId ++) 214 | printf("%lld, %d, %d, %d\n",tokenId, wordId, 215 | docChunkVec[chunkId]->wordTokens[tokenId], 216 | docChunkVec[chunkId]->wordTopics[tokenId]); 217 | } 218 | } 219 | 220 | /* 221 | for(long long i = 0;i < numTokens;i++) 222 | printf("%4lld %lld %lld\n", i, wordTokens[i], wordTopics[i]); 223 | */ 224 | 225 | } 226 | 227 | void Document::saveTopics(string fileName) 228 | { 229 | printf("Saving topics ...\n"); 230 | } 231 | 232 | 233 | 234 | 
-------------------------------------------------------------------------------- /src_culda/model/doc.h: -------------------------------------------------------------------------------- 1 | #ifndef _DOC_H_ 2 | 3 | #define _DOC_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "doc_chunk.h" 13 | 14 | using namespace std; 15 | 16 | /* 17 | * word-first format 18 | * fileName.word.data: docId list 19 | * fileName.word.idx: each line is consisted of: wordId, startIdx, endIdx; 20 | * fileName.vocab: word-2-id mapping 21 | */ 22 | 23 | 24 | 25 | class Document 26 | { 27 | 28 | public: 29 | 30 | int numDocs; 31 | int numWords; 32 | long long numTokens; 33 | int numChunks; 34 | int numWorkers; 35 | 36 | long long *docIndices; // numDocs + 1 37 | int *docLength; // numDocs 38 | 39 | vector docChunkVec; 40 | 41 | Document(); 42 | Document(const string &filePrefix, int argNumChunks); 43 | Document(const Document &doc); 44 | 45 | void loadDocument(const string &filePrefix, int argNumChunks); 46 | 47 | void clear(); 48 | 49 | ~Document(){ clear(); } 50 | 51 | void printDocumentAbbr(); 52 | void printDocumentFull(); 53 | void generateTopics(int k); 54 | void saveTopics(string fileName); 55 | 56 | }; 57 | 58 | 59 | #endif -------------------------------------------------------------------------------- /src_culda/model/doc_chunk.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include "doc_chunk.h" 12 | 13 | using namespace std; 14 | 15 | DocChunk::DocChunk(): 16 | wordIndices(NULL), 17 | slotIdToWordId(NULL), 18 | slotIndices(NULL), 19 | wordTokens(NULL), 20 | wordTopics(NULL), 21 | wordPerplexity(NULL), 22 | deviceWordIndices(NULL), 23 | deviceSlotIdToWordId(NULL), 24 | deviceSlotIndices(NULL), 25 | deviceWordTokens(NULL), 26 | deviceWordTopics(NULL), 27 | 
deviceWordPerplexity(NULL), 28 | deviceWordPerplexityMid(NULL), 29 | docRevIndices(NULL), 30 | docRevIdx(NULL), 31 | deviceDocRevIndices(NULL), 32 | deviceDocRevIdx(NULL) 33 | { 34 | } 35 | 36 | DocChunk::DocChunk(int argChunkId, int argDocIdStart, int argDocIdEnd, int argNumDocs, int argNumChunks): 37 | wordIndices(NULL), 38 | slotIdToWordId(NULL), 39 | slotIndices(NULL), 40 | wordTokens(NULL), 41 | wordTopics(NULL), 42 | wordPerplexity(NULL), 43 | deviceWordIndices(NULL), 44 | deviceSlotIdToWordId(NULL), 45 | deviceSlotIndices(NULL), 46 | deviceWordTokens(NULL), 47 | deviceWordTopics(NULL), 48 | deviceWordPerplexity(NULL), 49 | deviceWordPerplexityMid(NULL), 50 | docRevIndices(NULL), 51 | docRevIdx(NULL), 52 | deviceDocRevIndices(NULL), 53 | deviceDocRevIdx(NULL) 54 | { 55 | 56 | chunkId = argChunkId; 57 | docIdStart = argDocIdStart; 58 | docIdEnd = argDocIdEnd; 59 | chunkNumDocs = docIdEnd - docIdStart; 60 | numDocs = argNumDocs; 61 | numChunks = argNumChunks; 62 | } 63 | 64 | struct pthreadArgTheta 65 | { 66 | int numWords; 67 | int docStart; 68 | int docEnd; 69 | long long *wordIndices; 70 | int *wordTokens; 71 | 72 | vector > *tmpDocPtr; 73 | }; 74 | 75 | static void *ThetaDocReverse(void *arg) 76 | { 77 | pthreadArgTheta *localArg = (pthreadArgTheta*)arg; 78 | 79 | for(int wordId = 0; wordId < localArg->numWords; wordId ++){ 80 | 81 | for(long long tokenId = localArg->wordIndices[wordId]; 82 | tokenId < localArg->wordIndices[wordId + 1]; 83 | tokenId ++){ 84 | 85 | int tmpDocId = localArg->wordTokens[tokenId]; 86 | if( tmpDocId >= localArg->docStart && tmpDocId < localArg->docEnd) 87 | ((*(localArg->tmpDocPtr))[tmpDocId]).push_back(tokenId); 88 | } 89 | } 90 | } 91 | 92 | 93 | class SortClass 94 | { 95 | public: 96 | int wordId; 97 | int wordLen; 98 | SortClass(int a, int b):wordId(a),wordLen(b){} 99 | 100 | friend bool operator<(const SortClass &a, const SortClass &b) 101 | { 102 | return a.wordLen > b.wordLen; 103 | } 104 | }; 105 | 106 | void 
DocChunk::loadChunk(string wordIdxFileName, string wordDataFileName, int *docLength) 107 | { 108 | 109 | printf(" loading chunk %d ...\n", chunkId); 110 | clock_t clockStart = clock(); 111 | /* load wordIndices & wordLength*/ 112 | ifstream wordIdxStream(wordIdxFileName.c_str(), ios::in); 113 | if(!wordIdxStream.is_open()){ 114 | cout << "File " << wordIdxFileName << " open failed" << endl; 115 | exit(0); 116 | } 117 | 118 | numWords = 0; 119 | long long wordId, endIdx; 120 | vectorwordIndicesVec; 121 | wordIndicesVec.push_back(0); 122 | while(wordIdxStream >> wordId >> endIdx) 123 | wordIndicesVec.push_back(endIdx); 124 | 125 | numWords = wordIndicesVec.size() - 1; 126 | wordIdxStream.close(); 127 | 128 | vector wordLengthVec; 129 | wordIndices = new long long[numWords + 1]; 130 | 131 | for(int i = 0;i < numWords; i++) 132 | wordLengthVec.push_back(wordIndicesVec[i + 1] - wordIndicesVec[i]); 133 | for(int i = 0;i < numWords + 1; i++) 134 | wordIndices[i] = wordIndicesVec[i]; 135 | 136 | /* load token number */ 137 | long long wordDataFileSize; 138 | ifstream wordDataStream(wordDataFileName.c_str(), ios::in|ios::ate); 139 | if(!wordDataStream.is_open()){ 140 | cout << "File " << wordDataFileName << " open failed" << endl; 141 | exit(0); 142 | } 143 | wordDataFileSize = wordDataStream.tellg(); 144 | chunkNumTokens = wordDataFileSize/sizeof(int); 145 | wordDataStream.close(); 146 | 147 | /* sort words & slice words into slots when necessary */ 148 | vector sortVec; 149 | for(int i = 0;i < numWords;i++) 150 | sortVec.push_back(SortClass(i, wordLengthVec[i])); 151 | sort(sortVec.begin(), sortVec.end()); 152 | 153 | for(int i = 0;i < 10;i++) 154 | printf("i:%d, wordId:%d, len:%d\n", i, sortVec[i].wordId, sortVec[i].wordLen); 155 | 156 | int aveTokens = chunkNumTokens/NumConWorkers; 157 | printf("aveTokens:%d\n", aveTokens); 158 | 159 | vector slotToWordVec; 160 | vector slotIndicesVec; 161 | numSlots = 0; 162 | for(int i = 0; i < numWords; i ++){ 163 | 
//printf("%d\n",i); 164 | int tmpWordId = sortVec[i].wordId; 165 | int tmpWordLen = sortVec[i].wordLen; 166 | 167 | if(tmpWordLen > 1.05*aveTokens){ 168 | 169 | int tmpNumsSlices = (tmpWordLen + aveTokens - 1)/aveTokens; 170 | int tmpSliceSize = (tmpWordLen + tmpNumsSlices - 1)/tmpNumsSlices; 171 | 172 | for(int i = 0;i < tmpNumsSlices; i ++){ 173 | 174 | //printf("i:%d\n", i); 175 | long long tmpStartIdx = wordIndices[tmpWordId] + tmpSliceSize*i; 176 | long long tmpEndIdx = tmpStartIdx + tmpSliceSize; 177 | if(tmpEndIdx >= wordIndices[tmpWordId + 1]) 178 | tmpEndIdx = wordIndices[tmpWordId + 1]; 179 | 180 | slotToWordVec.push_back(tmpWordId); 181 | slotIndicesVec.push_back(tmpStartIdx); 182 | slotIndicesVec.push_back(tmpEndIdx); 183 | numSlots ++; 184 | } 185 | 186 | } 187 | else{ 188 | slotToWordVec.push_back(tmpWordId); 189 | slotIndicesVec.push_back(wordIndices[tmpWordId]); 190 | slotIndicesVec.push_back(wordIndices[tmpWordId + 1]); 191 | numSlots ++; 192 | } 193 | 194 | } 195 | 196 | for(int i = 0;i < 11;i ++)printf("%d\n",wordIndicesVec[i]); 197 | for(int i = 0;i < 10;i++) 198 | printf("i:%d, wordId:%d, len:%d, start:%lld, end:%lld\n", 199 | i, sortVec[i].wordId, sortVec[i].wordLen, wordIndices[sortVec[i].wordId] , wordIndices[sortVec[i].wordId + 1]); 200 | 201 | for(int i = 0;i < 10; i++) 202 | printf("slot:%d, wordId:%d, len:%d, start:%lld, end:%lld\n", i, slotToWordVec[i], slotIndicesVec[i*2 + 1] - slotIndicesVec[i*2], slotIndicesVec[i*2], slotIndicesVec[i*2+1]); 203 | 204 | printf("numSlots:%d\n", numSlots); 205 | 206 | slotIdToWordId = new int[numSlots]; 207 | slotIndices = new long long[numSlots*2]; 208 | 209 | for(int i = 0;i < numSlots; i++){ 210 | slotIdToWordId[i] = slotToWordVec[i]; 211 | slotIndices[i*2] = slotIndicesVec[i*2]; 212 | slotIndices[i*2 + 1] = slotIndicesVec[i*2 + 1]; 213 | } 214 | 215 | /* load tokens */ 216 | if(wordTokens != NULL)delete []wordTokens; 217 | if(wordTopics != NULL)delete []wordTopics; 218 | wordTokens = new 
int[chunkNumTokens]; 219 | wordTopics = new short[chunkNumTokens]; 220 | 221 | wordDataStream.open(wordDataFileName.c_str(), ios::in); 222 | for(long long i = 0;i < chunkNumTokens;i++) 223 | wordDataStream.read((char*)(&(wordTokens[i])), sizeof(int)); 224 | 225 | //generate doc reverse info 226 | if(docRevIndices != NULL)delete []docRevIndices; 227 | if(docRevIdx != NULL)delete []docRevIdx; 228 | docRevIndices = new long long[numDocs + 1](); 229 | docRevIdx = new TokenIdxType[chunkNumTokens](); 230 | 231 | for(int docId = 0, offset = 0; docId < numDocs; docId ++){ 232 | if(docId >= docIdStart && docId < docIdEnd){ 233 | docRevIndices[docId] = offset; 234 | docRevIndices[docId + 1] = offset + docLength[docId]; 235 | offset += docLength[docId]; 236 | } 237 | else{ 238 | docRevIndices[docId] = offset; 239 | docRevIndices[docId + 1] = offset; 240 | } 241 | } 242 | vector tmpDocPtr; 243 | for(int docId = 0; docId < numDocs;docId ++) 244 | tmpDocPtr.push_back(docRevIndices[docId]); 245 | 246 | for(int wordId = 0;wordId < numWords; wordId ++){ 247 | for(long long tokenId = wordIndices[wordId]; 248 | tokenId < wordIndices[wordId + 1]; 249 | tokenId ++){ 250 | 251 | int tmpDocId = wordTokens[tokenId]; 252 | docRevIdx[tmpDocPtr[tmpDocId]] = int(tokenId); 253 | tmpDocPtr[tmpDocId] ++; 254 | } 255 | } 256 | } 257 | 258 | 259 | void DocChunk::generateTopics(int k){ 260 | 261 | srand (time(NULL)); 262 | for(long long i = 0; i < chunkNumTokens;i++) 263 | wordTopics[i] = short(rand()%k); 264 | } 265 | 266 | struct wordStruct{ 267 | int wordId; 268 | int numTokens; 269 | 270 | wordStruct(int arg1, int arg2){ 271 | wordId = arg1; 272 | numTokens = arg2; 273 | } 274 | }; 275 | 276 | 277 | void DocChunk::allocGPU(int GPUid) 278 | { 279 | cudaSetDevice(GPUid); 280 | 281 | if(deviceWordIndices != NULL)cudaFree(deviceWordIndices); 282 | if(deviceSlotIdToWordId != NULL)cudaFree(deviceSlotIdToWordId); 283 | if(deviceSlotIndices != NULL)cudaFree(deviceSlotIndices); 284 | if(deviceWordTokens 
!= NULL)cudaFree(deviceWordTokens); 285 | if(deviceWordTopics != NULL)cudaFree(deviceWordTopics); 286 | if(deviceWordPerplexity != NULL)cudaFree(deviceWordPerplexity); 287 | if(deviceWordPerplexityMid != NULL)cudaFree(deviceWordPerplexityMid); 288 | 289 | if(deviceDocRevIndices != NULL)cudaFree(deviceDocRevIndices); 290 | if(deviceDocRevIdx != NULL)cudaFree(deviceDocRevIdx); 291 | 292 | cudaMalloc((void**)&deviceWordIndices, (numWords + 1)*sizeof(long long)); 293 | cudaMalloc((void**)&deviceSlotIdToWordId, numSlots*sizeof(int)); 294 | cudaMalloc((void**)&deviceSlotIndices, numSlots*2*sizeof(long long)); 295 | 296 | cudaMalloc((void**)&deviceWordTokens, chunkNumTokens*sizeof(int)); 297 | cudaMalloc((void**)&deviceWordTopics, chunkNumTokens*sizeof(short)); 298 | 299 | cudaMalloc((void**)&deviceWordPerplexity, numWords*(TrainBlockSize/32)*sizeof(double)); 300 | cudaMalloc((void**)&deviceWordPerplexityMid, ReduceParameter*sizeof(double)); 301 | 302 | gpuErr(cudaPeekAtLastError()); 303 | 304 | cudaMalloc((void**)&deviceDocRevIndices, (numDocs+1)*sizeof(long long)); 305 | cudaMalloc((void**)&deviceDocRevIdx, chunkNumTokens*sizeof(TokenIdxType)); 306 | 307 | long long totalByte = (numWords + 1)*sizeof(long long) + 308 | chunkNumTokens*sizeof(int) + 309 | chunkNumTokens*sizeof(short) + 310 | numWords*sizeof(double) + 311 | ReduceParameter*sizeof(double) + 312 | (numDocs + 1)*sizeof(long long) + 313 | chunkNumTokens*sizeof(TokenIdxType); 314 | printf("docChunk size:%.3f GB\n", totalByte/(1024.0*1024.0*1024.0)); 315 | 316 | cudaDeviceSynchronize(); 317 | gpuErr(cudaPeekAtLastError()); 318 | } 319 | 320 | void DocChunk::toGPU() 321 | { 322 | 323 | //tokens 324 | cudaMemcpy(deviceWordIndices, 325 | wordIndices, 326 | sizeof(long long)*(numWords + 1), 327 | cudaMemcpyHostToDevice); 328 | cudaMemcpy(deviceSlotIdToWordId, 329 | slotIdToWordId, 330 | sizeof(int)*numSlots, 331 | cudaMemcpyHostToDevice); 332 | cudaMemcpy(deviceSlotIndices, 333 | slotIndices, 334 | sizeof(long 
long)*numSlots*2, 335 | cudaMemcpyHostToDevice); 336 | cudaMemcpy(deviceWordTokens, 337 | wordTokens, 338 | sizeof(int)*chunkNumTokens, 339 | cudaMemcpyHostToDevice); 340 | cudaMemcpy(deviceWordTopics, 341 | wordTopics, 342 | sizeof(short)*chunkNumTokens, 343 | cudaMemcpyHostToDevice); 344 | 345 | 346 | //doc rev data 347 | cudaMemcpy(deviceDocRevIndices, 348 | docRevIndices, 349 | sizeof(long long)*(numDocs + 1), 350 | cudaMemcpyHostToDevice); 351 | cudaMemcpy(deviceDocRevIdx, 352 | docRevIdx, 353 | sizeof(TokenIdxType)*chunkNumTokens, 354 | cudaMemcpyHostToDevice); 355 | 356 | } 357 | 358 | void DocChunk::toCPU() 359 | { 360 | printf("DocChunk::toCPU() ChunkId:%d...\n", chunkId); 361 | cudaMemcpy(wordTopics, 362 | deviceWordTopics, 363 | sizeof(short)*chunkNumTokens, 364 | cudaMemcpyDeviceToHost); 365 | 366 | printf("finished DocChunk::toCPU() ...\n"); 367 | } 368 | -------------------------------------------------------------------------------- /src_culda/model/doc_chunk.h: -------------------------------------------------------------------------------- 1 | #ifndef _DOC_CHUNK_ 2 | #define _DOC_CHUNK_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | #include "culda_argument.h" 16 | 17 | using namespace std; 18 | 19 | class DocChunk 20 | { 21 | public: 22 | int chunkId; 23 | int docIdStart; 24 | int docIdEnd; 25 | 26 | int numWorkers; 27 | int numDocs; 28 | int numWords; 29 | int numSlots; 30 | int numChunks; 31 | 32 | long long chunkNumTokens; 33 | int chunkNumDocs; 34 | 35 | /* original input data */ 36 | long long *wordIndices; // numberWords + 1 37 | int *slotIdToWordId; // numSlots 38 | long long *slotIndices; // numSlots*2 39 | int *wordTokens; // chunkNumTokens 40 | short *wordTopics; // chunkNumTokens 41 | double *wordPerplexity; // chunkNumTokens 42 | 43 | long long *deviceWordIndices; // numWords + 1 44 | int *deviceSlotIdToWordId; // numSlots 45 | long long 
*deviceSlotIndices; // numSlots*2 46 | int *deviceWordTokens; // chunkNumTokens 47 | short *deviceWordTopics; // chunkNumTokens 48 | double *deviceWordPerplexity; // chunkNumTokens 49 | 50 | double *deviceWordPerplexityMid; 51 | 52 | /* reverse doc data */ 53 | long long *docRevIndices; // numDocs + 1 54 | TokenIdxType *docRevIdx; // chunkTokenSize 55 | 56 | long long *deviceDocRevIndices; // numDocs + 1 57 | TokenIdxType *deviceDocRevIdx; // chunkTokenSize 58 | 59 | DocChunk(); 60 | DocChunk(int argChunkId, 61 | int argDocIdStart, 62 | int argDocIdEnd, 63 | int argNumDocs, 64 | int argNumChunks); 65 | //Destructor: releases host arrays (allocated with new[]) and device buffers (cudaMalloc). 66 | ~DocChunk() 67 | { 68 | if(wordIndices != NULL)delete []wordIndices; 69 | if(slotIdToWordId != NULL)delete []slotIdToWordId; 70 | if(slotIndices != NULL)delete []slotIndices; 71 | if(wordTokens != NULL)delete []wordTokens; 72 | if(wordTopics != NULL)delete []wordTopics; 73 | if(wordPerplexity != NULL)delete []wordPerplexity; 74 | 75 | if(deviceWordIndices != NULL)cudaFree(deviceWordIndices); 76 | if(deviceSlotIdToWordId != NULL)cudaFree(deviceSlotIdToWordId); 77 | if(deviceSlotIndices != NULL)cudaFree(deviceSlotIndices); 78 | if(deviceWordTokens != NULL)cudaFree(deviceWordTokens); 79 | if(deviceWordTopics != NULL)cudaFree(deviceWordTopics); //fix: was cudaFree(deviceWordTokens) -- double free of deviceWordTokens and leak of deviceWordTopics 80 | if(deviceWordPerplexity != NULL)cudaFree(deviceWordPerplexity); 81 | if(deviceWordPerplexityMid != NULL)cudaFree(deviceWordPerplexityMid); 82 | 83 | if(deviceDocRevIndices != NULL)cudaFree(deviceDocRevIndices); 84 | if(deviceDocRevIdx != NULL)cudaFree(deviceDocRevIdx); 85 | 86 | } 87 | 88 | void loadChunk(string, string, int*); 89 | void generateTopics(int k); 90 | 91 | void allocGPU(int); 92 | void toGPU(); 93 | void toCPU(); 94 | 95 | 96 | }; 97 | 98 | #endif -------------------------------------------------------------------------------- /src_culda/model/model_phi.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 |
#include 8 | #include 9 | #include // std::time 10 | #include // std::rand, std::srand 11 | #include // std::memset 12 | #include 13 | 14 | #include 15 | #include "model_phi.h" 16 | #include "vocab.h" 17 | 18 | 19 | 20 | /* Implementations of class ModelPhi */ 21 | 22 | ModelPhi::ModelPhi(): 23 | k(0), 24 | numGPUs(1), 25 | numDocs(0), 26 | numWords(0), 27 | numChunks(1) 28 | { 29 | for(int i = 0;i < MaxNumGPU;i++){ 30 | hostPhiTopicWordShort[i] = NULL; 31 | hostPhiTopic[i] = NULL; 32 | } 33 | clearPtr(); 34 | } 35 | 36 | ModelPhi::ModelPhi( 37 | int argK, int argGPUs, int argDocs, int argWords, int argChunks): 38 | k(argK), 39 | numGPUs(argGPUs), 40 | numDocs(argDocs), 41 | numWords(argWords), 42 | numChunks(argChunks) 43 | { 44 | for(int i = 0;i < MaxNumGPU;i++){ 45 | hostPhiTopicWordShort[i] = NULL; 46 | hostPhiTopic[i] = NULL; 47 | } 48 | clearPtr(); 49 | } 50 | 51 | void ModelPhi::InitData(Document &doc) 52 | { 53 | 54 | clearPtr(); 55 | if(k <= 0 || numDocs <= 0 || numWords <= 0)return; 56 | 57 | //prepare data space for phi. 
58 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){ 59 | hostPhiTopicWordShort[chunkId] = new PHITYPE[k*numWords](); 60 | hostPhiTopic[chunkId] = new int[k](); 61 | } 62 | 63 | for(int GPUid = 0; GPUid < numGPUs; GPUid ++){ 64 | ModelPhiGPU *tmpPtr = new ModelPhiGPU(k, numGPUs, GPUid, numDocs, numWords); 65 | tmpPtr->allocGPU(); 66 | phiChunkVec.push_back(tmpPtr); 67 | } 68 | } 69 | 70 | void ModelPhi::UpdatePhiGPU(Document &doc, int chunkId, cudaStream_t stream) 71 | { 72 | phiChunkVec[chunkId]->UpdatePhiGPU(doc, chunkId, stream); 73 | } 74 | 75 | void ModelPhi::UpdatePhiHead(float beta, cudaStream_t *stream) 76 | { 77 | 78 | if(stream != NULL){ 79 | for(int i = 0;i < numGPUs; i++) 80 | phiChunkVec[i]->UpdatePhiHead(beta, stream[i]); 81 | } 82 | else{ 83 | for(int i = 0;i < numGPUs; i++) 84 | phiChunkVec[i]->UpdatePhiHead(beta); 85 | } 86 | } 87 | 88 | void ModelPhi::MasterGPUToCPU(cudaStream_t stream) 89 | { 90 | 91 | //phi 92 | cudaMemcpyAsync(hostPhiTopicWordShort[0], 93 | phiChunkVec[0]->devicePhiTopicWordShort, 94 | sizeof(PHITYPE)*k*numWords, 95 | cudaMemcpyDeviceToHost, 96 | stream); 97 | 98 | cudaMemcpyAsync(hostPhiTopic[0], 99 | phiChunkVec[0]->devicePhiTopic, 100 | sizeof(int)*k, 101 | cudaMemcpyDeviceToHost, 102 | stream); 103 | } 104 | 105 | void ModelPhi::MasterGPUCollect(int GPUid, cudaStream_t stream) 106 | { 107 | cudaMemcpyAsync(phiChunkVec[0]->devicePhiTopicWordShortCopy, 108 | phiChunkVec[GPUid]->devicePhiTopicWordShort, 109 | sizeof(PHITYPE)*k*numWords, 110 | cudaMemcpyDeviceToDevice, 111 | stream); 112 | cudaMemcpyAsync(phiChunkVec[0]->devicePhiTopicCopy, 113 | phiChunkVec[GPUid]->devicePhiTopic, 114 | sizeof(int)*k, 115 | cudaMemcpyDeviceToDevice, 116 | stream); 117 | } 118 | 119 | void ModelPhi::MasterGPUDistribute(int GPUid, cudaStream_t stream) 120 | { 121 | cudaMemcpyAsync(phiChunkVec[GPUid]->devicePhiTopicWordShort, 122 | phiChunkVec[0]->devicePhiTopicWordShort, 123 | sizeof(PHITYPE)*k*numWords, 124 | cudaMemcpyDeviceToDevice, 
125 | stream); 126 | cudaMemcpyAsync(phiChunkVec[GPUid]->devicePhiTopic, 127 | phiChunkVec[0]->devicePhiTopic, 128 | sizeof(int)*k, 129 | cudaMemcpyDeviceToDevice, 130 | stream); 131 | } 132 | 133 | //Reduce the per-GPU phi copies into GPU 0's buffers via the device-side kernel. 134 | void ModelPhi::MasterGPUReduce(cudaStream_t stream) 135 | { 136 | cudaSetDevice(0); 137 | LDAUpdatePhiReduceAPI( 138 | k, 139 | numWords, 140 | phiChunkVec[0]->devicePhiTopicWordShort, 141 | phiChunkVec[0]->devicePhiTopicWordShortCopy, 142 | phiChunkVec[0]->devicePhiTopic, 143 | phiChunkVec[0]->devicePhiTopicCopy, 144 | stream); 145 | } 146 | 147 | //Release all host/GPU phi state; safe to call repeatedly (InitData and the destructor both call it). 148 | void ModelPhi::clearPtr() 149 | { 150 | for(int i = 0;i < phiChunkVec.size(); i++) 151 | if(phiChunkVec[i] != NULL)delete phiChunkVec[i]; 152 | phiChunkVec.clear(); //fix: drop the dangling pointers so a second clearPtr() cannot double-delete 153 | 154 | //CPU data release 155 | for(int i = 0;i < numChunks;i++){ 156 | if(hostPhiTopicWordShort[i] != NULL) delete [] hostPhiTopicWordShort[i]; //fix: allocated with new[], so delete[] is required 157 | if(hostPhiTopic[i] != NULL) delete [] hostPhiTopic[i]; //fix: allocated with new[], so delete[] is required 158 | hostPhiTopicWordShort[i] = NULL; 159 | hostPhiTopic[i] = NULL; 160 | } 161 | //printf("ModelPhi::clearPtr() finished\n"); 162 | } 163 | 164 | 165 | 166 | //Dump the first 1000 words' topic counts to a text file for inspection. 167 | void ModelPhi::savePhi(string fileName) 168 | { 169 | printf("Saving phi ...\n"); 170 | 171 | ofstream phiStream(fileName.c_str(), ios::out); 172 | 173 | int tmpNumWords = numWords; 174 | tmpNumWords = 1000; 175 | for(int wordId = 0;wordId < tmpNumWords; wordId++){ 176 | phiStream << "Word id:" << wordId << ", "; 177 | 178 | for(int kite = 0; kite < k;kite++ ){ 179 | if(kite%32 == 0){ 180 | phiStream << endl; 181 | phiStream.width(2); 182 | phiStream << kite/32; 183 | phiStream.width(0); 184 | phiStream << ":"; 185 | } 186 | 187 | int tmpVal = hostPhiTopicWordShort[0][wordId*k + kite]; 188 | if(tmpVal == 0) 189 | phiStream << "_,"; 190 | else 191 | phiStream << hostPhiTopicWordShort[0][wordId*k + kite] << ","; 192 | } 193 | phiStream << endl; 194 | } 195 | } 196 | 197 | 198 | 199 | 200 | void ModelPhi::validPhi(Document&doc) 201 | { 202 | 203 | printf("Calling validPhi() ..."); 204 | int tmpPhi[1024]; 205 | int tmpPhiTopic[1024]; 206 | 207 | clock_t clockStart = clock(); 208 | int maxPhi = 0; 209 |
// validate hostPhiTopicWord 204 | for(int wordId = 0; wordId < numWords; wordId ++){ 205 | 206 | for(int i = 0;i < 1024;i++)tmpPhi[i] = 0; 207 | 208 | //add 209 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){ 210 | for(long long tokenId = doc.docChunkVec[chunkId]->wordIndices[wordId]; 211 | tokenId < doc.docChunkVec[chunkId]->wordIndices[wordId + 1]; 212 | tokenId ++){ 213 | 214 | tmpPhi[doc.docChunkVec[chunkId]->wordTopics[tokenId]] ++; 215 | } 216 | } 217 | 218 | //validate hostPhiTopicWord 219 | 220 | for(int i = 0;i < 1024; i++){ 221 | if(hostPhiTopicWordShort[0][wordId*k + i] > maxPhi) maxPhi = hostPhiTopicWordShort[0][wordId*k + i]; 222 | 223 | if(tmpPhi[i] != hostPhiTopicWordShort[0][wordId*k + i]){ 224 | 225 | printf("ValidPhi Error: wordId(%d), topic(%d), tmpphi[i](%d), phi(%d)\n", 226 | wordId, i, tmpPhi[i], hostPhiTopicWordShort[0][wordId*k + i]); 227 | 228 | printf("topic:%d, level1:%d, level2:%d\n", i, i/32, i%32); 229 | 230 | for(int j = 0;j < 32;j ++){ 231 | 232 | printf("tmpphi:\n"); 233 | printf("%2d:",j); 234 | for(int m = 0;m < 32;m ++){ 235 | printf("%d,",tmpPhi[j*32 + m]); 236 | } 237 | printf("\n"); 238 | 239 | printf("phi :\n"); 240 | printf("%2d:",j); 241 | for(int m = 0;m < 32;m ++){ 242 | printf("%d,",hostPhiTopicWordShort[0][wordId*k + j*32 + m]); 243 | } 244 | printf("\n"); 245 | 246 | } 247 | exit(0); 248 | } 249 | } 250 | } 251 | 252 | //validate hostPhiTopic 253 | for(int i = 0;i < 1024; i++)tmpPhiTopic[i] = 0; 254 | //Step 1: add 255 | 256 | for(int wordId = 0; wordId < numWords; wordId ++){ 257 | 258 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){ 259 | for(long long tokenId = doc.docChunkVec[chunkId]->wordIndices[wordId]; 260 | tokenId < doc.docChunkVec[chunkId]->wordIndices[wordId + 1]; 261 | tokenId ++){ 262 | 263 | tmpPhiTopic[doc.docChunkVec[chunkId]->wordTopics[tokenId]] ++; 264 | } 265 | } 266 | } 267 | 268 | //Step 2: validate 269 | for(int i = 0;i < 1024;i++){ 270 | if(tmpPhiTopic[i] != 
hostPhiTopic[0][i]){ 271 | printf("ValidPhi Error 2: topic(%d), tmpPhiTopic(%d), hostPhiTopic[i](%d)\n", 272 | i, tmpPhiTopic[i], hostPhiTopic[i]); 273 | exit(0); 274 | } 275 | } 276 | 277 | printf("Validate Phi passed ...\n"); 278 | printf("ValidatePhi time:%.2fs\n", (clock() - clockStart)/(double)CLOCKS_PER_SEC); 279 | printf("max phi:%d\n", maxPhi); 280 | } 281 | 282 | 283 | /* 284 | class FreqTuple{ 285 | 286 | public: 287 | int id; 288 | int count; 289 | 290 | FreqTuple(int arg1, int arg2):id(arg1),count(arg2){} 291 | friend bool operator<(const FreqTuple &left ,const FreqTuple &right){return left.count > right.count;} 292 | }; 293 | 294 | void ModelPhi::saveWord(std::string wordFileName, Vocabulary &vocab) 295 | { 296 | 297 | printf("Saving word model ...\n"); 298 | ofstream wordFileStream(wordFileName.c_str(), ios::out); 299 | 300 | wordFileStream << "topic, total, top words" << endl; 301 | for(int kite = 0; kite < k; kite ++){ 302 | 303 | int totalCount = 0; 304 | vector wordVec; 305 | 306 | for(int wordId = 0; wordId < numWords; wordId ++){ 307 | 308 | int tmpCount = hostPhiTopicWord[wordId*k + kite]; 309 | if(tmpCount <= 0)continue; 310 | 311 | totalCount += tmpCount; 312 | wordVec.push_back(FreqTuple(wordId, tmpCount)); 313 | } 314 | 315 | sort(wordVec.begin(), wordVec.end()); 316 | 317 | //output 318 | if(totalCount <= 0)continue; 319 | 320 | wordFileStream.width(4); 321 | wordFileStream << kite << " " << totalCount << " "; 322 | for(int i = 0;i < wordVec.size(); i ++){ 323 | wordFileStream << "(" << vocab.getWordById(wordVec[i].id) << ","; 324 | wordFileStream << wordVec[i].count << ") "; 325 | } 326 | wordFileStream << endl; 327 | } 328 | } 329 | */ 330 | 331 | 332 | struct pthreadArgShort 333 | { 334 | PHITYPE *matrixA; 335 | PHITYPE *matrixB; 336 | int idxStart; 337 | int idxEnd; 338 | int matrixSize; 339 | }; 340 | 341 | struct pthreadArgInt 342 | { 343 | int *matrixA; 344 | int *matrixB; 345 | int idxStart; 346 | int idxEnd; 347 | int matrixSize; 
348 | }; 349 | 350 | 351 | 352 | static void *PhiReduceThreadShort(void *arg) 353 | { 354 | pthreadArgShort *localArg = (pthreadArgShort*)arg; 355 | 356 | int startIdx = localArg->idxStart; 357 | int endIdx = localArg->idxEnd; 358 | if(endIdx >= localArg->matrixSize) 359 | endIdx = localArg->matrixSize; 360 | 361 | PHITYPE *matrixA = localArg->matrixA; 362 | PHITYPE *matrixB = localArg->matrixB; 363 | for(int i = startIdx; i < endIdx; i ++) 364 | matrixA[i] += matrixB[i]; 365 | } 366 | 367 | static void *PhiReduceThreadInt(void *arg) 368 | { 369 | pthreadArgInt *localArg = (pthreadArgInt*)arg; 370 | 371 | int startIdx = localArg->idxStart; 372 | int endIdx = localArg->idxEnd; 373 | if(endIdx >= localArg->matrixSize) 374 | endIdx = localArg->matrixSize; 375 | 376 | int *matrixA = localArg->matrixA; 377 | int *matrixB = localArg->matrixB; 378 | for(int i = startIdx; i < endIdx; i ++) 379 | matrixA[i] += matrixB[i]; 380 | } 381 | 382 | static void PhiReduceShort(PHITYPE *matrixA, PHITYPE *matrixB, int matrixSize) 383 | { 384 | 385 | 386 | struct timespec begin, end; 387 | double elapsed; 388 | clock_gettime(CLOCK_MONOTONIC, &begin); 389 | 390 | const int numThreads = 48; 391 | pthread_t threads[numThreads]; 392 | pthreadArgShort threadArgs[numThreads]; 393 | int perThreadSize = (matrixSize + numThreads - 1)/numThreads; 394 | 395 | //launch 396 | for(int threadId = 0; threadId < numThreads; threadId ++){ 397 | threadArgs[threadId].matrixA = matrixA; 398 | threadArgs[threadId].matrixB = matrixB; 399 | threadArgs[threadId].idxStart = perThreadSize*threadId; 400 | threadArgs[threadId].idxEnd = perThreadSize*threadId + perThreadSize; 401 | threadArgs[threadId].matrixSize = matrixSize; 402 | 403 | pthread_create(&(threads[threadId]), 404 | NULL, 405 | PhiReduceThreadShort, 406 | (void*)(&(threadArgs[threadId]))); 407 | } 408 | 409 | //join 410 | for(int threadId = 0; threadId < numThreads; threadId ++) 411 | pthread_join(threads[threadId], NULL); 412 | 413 | 
clock_gettime(CLOCK_MONOTONIC, &end); 414 | 415 | elapsed = end.tv_sec - begin.tv_sec; 416 | elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0; 417 | printf("PhiReduceShort: %.8lfs\n",elapsed); 418 | 419 | } 420 | 421 | static void PhiReduceInt(int *matrixA, int *matrixB, int matrixSize) 422 | { 423 | 424 | 425 | struct timespec begin, end; 426 | double elapsed; 427 | clock_gettime(CLOCK_MONOTONIC, &begin); 428 | 429 | const int numThreads = 16; 430 | pthread_t threads[numThreads]; 431 | pthreadArgInt threadArgs[numThreads]; 432 | int perThreadSize = (matrixSize + numThreads - 1)/numThreads; 433 | 434 | //launch 435 | for(int threadId = 0; threadId < numThreads; threadId ++){ 436 | threadArgs[threadId].matrixA = matrixA; 437 | threadArgs[threadId].matrixB = matrixB; 438 | threadArgs[threadId].idxStart = perThreadSize*threadId; 439 | threadArgs[threadId].idxEnd = perThreadSize*threadId + perThreadSize; 440 | threadArgs[threadId].matrixSize = matrixSize; 441 | 442 | pthread_create(&(threads[threadId]), 443 | NULL, 444 | PhiReduceThreadInt, 445 | (void*)(&(threadArgs[threadId]))); 446 | } 447 | 448 | //join 449 | for(int threadId = 0; threadId < numThreads; threadId ++) 450 | pthread_join(threads[threadId], NULL); 451 | 452 | clock_gettime(CLOCK_MONOTONIC, &end); 453 | 454 | elapsed = end.tv_sec - begin.tv_sec; 455 | elapsed += (end.tv_nsec - begin.tv_nsec) / 1000000000.0; 456 | printf("PhiReduceInt: %.8lfs\n",elapsed); 457 | 458 | } 459 | 460 | void ModelPhi::reduceCPU() 461 | { 462 | for(int i = 1;i < numChunks;i++){ 463 | PhiReduceShort(hostPhiTopicWordShort[0], hostPhiTopicWordShort[i], numWords*k); 464 | PhiReduceInt(hostPhiTopic[0], hostPhiTopic[i], k); 465 | } 466 | } 467 | 468 | 469 | 470 | 471 | -------------------------------------------------------------------------------- /src_culda/model/model_phi.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODEL_PHI_H_ 2 | 3 | #define _MODEL_PHI_H_ 4 | 5 | 6 | 
#include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include "culda_argument.h" 13 | #include "vocab.h" 14 | #include "doc.h" 15 | 16 | #include "../kernel/lda_train_kernel.h" 17 | #include "../kernel/lda_phi_kernel.h" 18 | #include "model_phi_gpu.h" 19 | 20 | using namespace std; 21 | 22 | class ModelPhi 23 | { 24 | public: 25 | int k; 26 | int numGPUs; 27 | int numDocs; 28 | int numWords; 29 | int numChunks; 30 | 31 | //cpu data 32 | PHITYPE *hostPhiTopicWordShort[MaxNumGPU]; 33 | int *hostPhiTopic[MaxNumGPU]; 34 | 35 | vector phiChunkVec; 36 | 37 | ModelPhi(); 38 | ModelPhi(int argK, int argGPU, int argDocs, int argWords, int numChunks); 39 | 40 | void InitData(Document&); 41 | void UpdatePhiGPU(Document&, int chunkId, cudaStream_t s=0); 42 | void UpdatePhiHead(float beta,cudaStream_t *stream=NULL); 43 | 44 | void clearPtr(); 45 | 46 | ~ModelPhi(){ clearPtr();} 47 | 48 | void MasterGPUCollect(int GPUid, cudaStream_t stream=0); 49 | void MasterGPUDistribute(int GPUid, cudaStream_t stream=0); 50 | void MasterGPUToCPU(cudaStream_t stream=0); 51 | void MasterGPUReduce(cudaStream_t stream=0); 52 | 53 | void reduceCPU(); 54 | 55 | void validPhi(Document&); 56 | void savePhi(string fileName); 57 | }; 58 | 59 | 60 | #endif -------------------------------------------------------------------------------- /src_culda/model/model_phi_gpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include // std::time 8 | #include // std::rand, std::srand 9 | #include // std::memset 10 | #include 11 | 12 | #include 13 | #include "model_phi_gpu.h" 14 | #include "vocab.h" 15 | 16 | 17 | 18 | ModelPhiGPU::ModelPhiGPU(): 19 | k(0), 20 | numGPUs(1), 21 | GPUid(0), 22 | numDocs(0), 23 | numWords(0), 24 | devicePhiTopicWordShort(NULL), 25 | devicePhiTopicWordSub(NULL), 26 | devicePhiTopic(NULL), 27 | devicePhiHead(NULL), 28 | 
devicePhiTopicWordShortCopy(NULL), 29 | devicePhiTopicCopy(NULL) 30 | { 31 | } 32 | 33 | ModelPhiGPU::ModelPhiGPU( 34 | int argk, 35 | int argNumGPUs, 36 | int argid, 37 | int argdoc, 38 | int argword): 39 | k(argk), 40 | numGPUs(argNumGPUs), 41 | GPUid(argid), 42 | numDocs(argdoc), 43 | numWords(argword), 44 | devicePhiTopicWordShort(NULL), 45 | devicePhiTopicWordSub(NULL), 46 | devicePhiTopic(NULL), 47 | devicePhiHead(NULL), 48 | devicePhiTopicWordShortCopy(NULL), 49 | devicePhiTopicCopy(NULL) 50 | { 51 | } 52 | 53 | void ModelPhiGPU::allocGPU() 54 | { 55 | cudaSetDevice(GPUid); 56 | cudaMalloc((void**)&devicePhiTopicWordShort, sizeof(PHITYPE)*k*numWords); 57 | cudaMalloc((void**)&devicePhiTopicWordSub, sizeof(int)*k*UpdateNumWorkers); 58 | cudaMalloc((void**)&devicePhiTopic, sizeof(int)*k); 59 | cudaMalloc((void**)&devicePhiHead, sizeof(half)*k*numWords); 60 | 61 | if(GPUid == 0 && numGPUs > 1){ 62 | cudaMalloc((void**)&devicePhiTopicWordShortCopy, sizeof(PHITYPE)*k*numWords); 63 | cudaMalloc((void**)&devicePhiTopicCopy, sizeof(int)*k); 64 | } 65 | long long totalByte = sizeof(PHITYPE)*k*numWords + 66 | sizeof(int)*k*UpdateNumWorkers + 67 | sizeof(int)*k + 68 | sizeof(half)*k*numWords; 69 | 70 | printf("phi sizeof:%.3f GB\n", totalByte/(1024.0*1024.0*1024.0)); 71 | 72 | cudaDeviceSynchronize(); 73 | gpuErr(cudaPeekAtLastError()); 74 | } 75 | 76 | void ModelPhiGPU::UpdatePhiGPU(Document &doc, int chunkId, cudaStream_t stream) 77 | { 78 | cudaSetDevice(GPUid); 79 | cudaMemsetAsync(devicePhiTopic, 0, k*sizeof(int), stream); 80 | 81 | LDAUpdatePhiAPI( 82 | k, 83 | numWords, 84 | doc.docChunkVec[chunkId]->deviceWordIndices, 85 | doc.docChunkVec[chunkId]->deviceWordTopics, 86 | devicePhiTopicWordShort, 87 | devicePhiTopicWordSub, 88 | devicePhiTopic, 89 | stream 90 | ); 91 | } 92 | 93 | void ModelPhiGPU::UpdatePhiHead(float beta, cudaStream_t stream) 94 | { 95 | 96 | cudaSetDevice(GPUid); 97 | //printf("ModelPhiGPU::UpdatePhiHead() ... 
id:%d\n", GPUid); 98 | LDAComputePhiHeadAPI( 99 | k, 100 | beta, 101 | numWords, 102 | devicePhiTopicWordShort, 103 | devicePhiTopic, 104 | devicePhiHead, 105 | stream); 106 | } 107 | 108 | void ModelPhiGPU::clearPtr() 109 | { 110 | 111 | if(devicePhiTopicWordShort != NULL) cudaFree(devicePhiTopicWordShort); 112 | if(devicePhiTopicWordSub != NULL) cudaFree(devicePhiTopicWordSub); 113 | if(devicePhiTopic != NULL) cudaFree(devicePhiTopic); 114 | if(devicePhiHead != NULL) cudaFree(devicePhiHead); 115 | 116 | devicePhiTopicWordSub = NULL; 117 | devicePhiTopicWordShort = NULL; 118 | devicePhiTopic = NULL; 119 | devicePhiHead = NULL; 120 | } 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /src_culda/model/model_phi_gpu.h: -------------------------------------------------------------------------------- 1 | #ifndef _MODEL_PHI_GPU_H_ 2 | #define _MODEL_PHI_GPU_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include "culda_argument.h" 11 | #include "vocab.h" 12 | #include "doc.h" 13 | 14 | #include "../kernel/lda_train_kernel.h" 15 | #include "../kernel/lda_phi_kernel.h" 16 | 17 | class ModelPhiGPU 18 | { 19 | public: 20 | int k; 21 | int numGPUs; 22 | int GPUid; 23 | int numDocs; 24 | int numWords; 25 | 26 | 27 | PHITYPE *devicePhiTopicWordShort; 28 | int *devicePhiTopicWordSub; 29 | int *devicePhiTopic; 30 | half *devicePhiHead; 31 | 32 | PHITYPE *devicePhiTopicWordShortCopy; 33 | int *devicePhiTopicCopy; 34 | 35 | ModelPhiGPU(); 36 | ModelPhiGPU(int, int, int, int, int); 37 | ~ModelPhiGPU(){clearPtr();} 38 | 39 | void allocGPU(); 40 | void UpdatePhiGPU(Document &, int, cudaStream_t stream=0); 41 | void UpdatePhiHead(float, cudaStream_t stream=0); 42 | 43 | void clearPtr(); 44 | }; 45 | 46 | #endif -------------------------------------------------------------------------------- /src_culda/model/model_theta.cpp: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include // std::time 10 | #include // std::rand, std::srand 11 | #include // std::memset 12 | #include 13 | 14 | #include 15 | #include "model_theta.h" 16 | #include "vocab.h" 17 | 18 | 19 | 20 | /* Implementations of class ModelTheta */ 21 | 22 | ModelTheta::ModelTheta(): 23 | k(0), 24 | numDocs(0), 25 | numWords(0), 26 | numChunks(0), 27 | thetaNNZ(0) 28 | { 29 | clearPtr(); 30 | } 31 | 32 | ModelTheta::ModelTheta(int argK,int argDocs, int argWords, int argNumChunks): 33 | k(argK), 34 | numDocs(argDocs), 35 | numWords(argWords), 36 | numChunks(argNumChunks), 37 | thetaNNZ(0) 38 | { 39 | clearPtr(); 40 | } 41 | 42 | 43 | 44 | void ModelTheta::InitData(Document &doc) 45 | { 46 | 47 | clearPtr(); 48 | 49 | vector docLenVec; 50 | for(int docId = 0; docId < numDocs; docId ++){ 51 | int tmpLen = doc.docLength[docId]; 52 | if(tmpLen >= k)tmpLen = k; 53 | 54 | tmpLen = ((tmpLen + 31)/32)*32; 55 | docLenVec.push_back(tmpLen); 56 | } 57 | 58 | //chunk by chunk 59 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){ 60 | int tmpChunkNNZ = 0; 61 | 62 | for(int docId = doc.docChunkVec[chunkId]->docIdStart; 63 | docId < doc.docChunkVec[chunkId]->docIdEnd; 64 | docId ++) 65 | tmpChunkNNZ += docLenVec[docId]; 66 | 67 | ModelThetaChunk *tmpPtr = new ModelThetaChunk( 68 | k, 69 | doc.numDocs, 70 | doc.numWords, 71 | doc.numChunks, 72 | chunkId, 73 | doc.docChunkVec[chunkId]->docIdStart, 74 | doc.docChunkVec[chunkId]->docIdEnd, 75 | doc.docChunkVec[chunkId]->docIdEnd - doc.docChunkVec[chunkId]->docIdStart, 76 | tmpChunkNNZ); 77 | 78 | //printf("chunkId:%d, tmpChunkNNZ:%d\n", chunkId, tmpChunkNNZ); 79 | 80 | tmpPtr->InitData(docLenVec); 81 | thetaChunkVec.push_back(tmpPtr); 82 | } 83 | 84 | cudaDeviceSynchronize(); 85 | gpuErr(cudaPeekAtLastError()); 86 | } 87 | 88 | void ModelTheta::UpdateThetaGPU(Document 
&doc, cudaStream_t *stream) 89 | { 90 | cudaDeviceSynchronize(); 91 | gpuErr(cudaPeekAtLastError()); 92 | 93 | if(stream != NULL){ 94 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++) 95 | thetaChunkVec[chunkId]->UpdateThetaGPU(doc, stream[chunkId]); 96 | } 97 | else{ 98 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++) 99 | thetaChunkVec[chunkId]->UpdateThetaGPU(doc); 100 | } 101 | cudaDeviceSynchronize(); 102 | gpuErr(cudaPeekAtLastError()); 103 | } 104 | 105 | void ModelTheta::toGPU() 106 | { 107 | for(int chunkId = 0; chunkId < numChunks; chunkId ++) 108 | thetaChunkVec[chunkId]->toGPU(); 109 | } 110 | 111 | void ModelTheta::toCPU() 112 | { 113 | for(int chunkId = 0; chunkId < numChunks; chunkId ++) 114 | thetaChunkVec[chunkId]->toCPU(); 115 | 116 | } 117 | 118 | void ModelTheta::clearPtr() 119 | { 120 | for(int i = 0;i < thetaChunkVec.size(); i++) 121 | delete thetaChunkVec[i]; 122 | } 123 | 124 | 125 | /* 126 | float ModelTheta::countZero() 127 | { 128 | toCPU(); 129 | long long totalEntry = 0; 130 | long long zeroEntry = 0; 131 | for(int i = 0;i < numDocs;i++){ 132 | for(int j = hostThetaMaxIA[i];j < hostThetaMaxIA[i+1];j++){ 133 | totalEntry++; 134 | if(hostThetaA[j] == 0)zeroEntry ++; 135 | } 136 | } 137 | return 1.0*zeroEntry/totalEntry; 138 | } 139 | */ 140 | 141 | /* 142 | float ModelTheta::countIA() 143 | { 144 | toCPU(); 145 | 146 | long long total = 0; 147 | long long cur = 0; 148 | for(int i = 0;i < numDocs;i++){ 149 | int startIdx = hostThetaMaxIA[i]; 150 | int maxEndIdx = hostThetaMaxIA[i+1]; 151 | int curEndIdx = hostThetaCurIA[i]; 152 | total += maxEndIdx - startIdx; 153 | cur += curEndIdx - startIdx; 154 | } 155 | 156 | return 1.0*cur/total; 157 | } 158 | */ 159 | 160 | 161 | /* 162 | void ModelTheta::saveTheta(string fileName) 163 | { 164 | 165 | printf("Saving theta ...\n"); 166 | 167 | ofstream thetaStream(fileName.c_str(), ios::out); 168 | int tmpNumDocs = numDocs; 169 | tmpNumDocs = 1000; 170 | for(int i = 0;i < 
tmpNumDocs;i++){ 171 | thetaStream << "Doc id:" << i << ", "; 172 | thetaStream << "len:" << hostThetaMaxIA[i+1] - hostThetaMaxIA[i] << ", "; 173 | thetaStream << "MaxIA:" << hostThetaMaxIA[i] << " - " << hostThetaMaxIA[i + 1] << ", "; 174 | thetaStream << "CurIA:" << hostThetaCurIA[i]; 175 | thetaStream << endl; 176 | 177 | int totalA = 0; 178 | for(int j = hostThetaMaxIA[i]; j < hostThetaMaxIA[i+1] ; j++){ 179 | 180 | thetaStream << "IA(" << j << "), " 181 | << "JA(" << hostThetaJA[j] << "), " 182 | << "A(" << hostThetaA[j] << ")\n"; 183 | totalA += hostThetaA[j]; 184 | } 185 | thetaStream << "total: " << totalA << "\n"; 186 | } 187 | } 188 | */ 189 | 190 | 191 | void ModelTheta::validTheta(Document &doc) 192 | { 193 | for(int chunkId = 0; chunkId < numChunks; chunkId ++) 194 | thetaChunkVec[chunkId]->validTheta(doc); 195 | } 196 | 197 | class FreqTuple{ 198 | 199 | public: 200 | int id; 201 | int count; 202 | 203 | FreqTuple(int arg1, int arg2):id(arg1),count(arg2){} 204 | friend bool operator<(const FreqTuple &left ,const FreqTuple &right){return left.count > right.count;} 205 | }; 206 | 207 | /* 208 | void ModelTheta::saveDoc(std::string docFileName) 209 | { 210 | ofstream docFileStream(docFileName.c_str(), ios::out); 211 | 212 | printf("Saving doc model ...\n"); 213 | docFileStream << "Doc, total, top topics" << endl; 214 | 215 | for(int docId = 0; docId < numDocs; docId ++){ 216 | 217 | int totalCount = 0; 218 | vector topicVec; 219 | for(int tmpIdx = hostThetaMaxIA[docId]; tmpIdx < hostThetaMaxIA[docId + 1]; tmpIdx ++){ 220 | 221 | int tmpCount = hostThetaA[tmpIdx]; 222 | int tmpK = hostThetaJA[tmpIdx]; 223 | if(tmpCount == 0)continue; 224 | 225 | totalCount += tmpCount; 226 | topicVec.push_back(FreqTuple(tmpK, tmpCount)); 227 | } 228 | 229 | sort(topicVec.begin(), topicVec.end()); 230 | 231 | //output 232 | docFileStream.width(4); 233 | docFileStream << docId << "| " << totalCount << " "; 234 | for(int i = 0;i < topicVec.size();i++){ 235 | docFileStream << 
            "(" << topicVec[i].id << ",";
            docFileStream << topicVec[i].count << "), ";
        }
        docFileStream << endl;
    }

}
*/




-------------------------------------------------------------------------------- /src_culda/model/model_theta.h: --------------------------------------------------------------------------------
#ifndef _GPU_THETA_H_

#define _GPU_THETA_H_


// NOTE(review): the angle-bracket system-header names below were stripped by
// the text extraction (they render as bare "#include"); restore the original
// names (presumably <iostream>, <fstream>, <vector>, <cuda_runtime.h>, ...)
// before compiling.
#include
#include
#include
#include

#include
#include "culda_argument.h"
#include "vocab.h"
#include "doc.h"

#include "../kernel/lda_train_kernel.h"
#include "../kernel/lda_theta_kernel.h"
#include "model_theta_chunk.h"

using namespace std;

// ModelTheta: the document-topic (theta) side of the LDA sampler, sharded
// into one ModelThetaChunk per document chunk (one chunk per GPU).  This
// class only owns the shard vector and fans every operation out to the
// per-chunk objects.
class ModelTheta
{
public:
    int k;         // number of topics
    int numDocs;   // total number of documents across all chunks
    int numWords;  // vocabulary size
    int numChunks; // number of document chunks (== number of GPUs used)

    int thetaNNZ;  // total capacity (non-zeros) of the sparse theta storage

    // Per-chunk theta shards.  NOTE(review): the template argument was lost
    // in extraction — presumably vector<ModelThetaChunk*>; confirm against
    // model_theta.cpp.
    vector thetaChunkVec;

    ModelTheta();
    ModelTheta(int argK, int argDocs, int argWords, int argNumChunks);

    // Allocate host+device storage for every chunk, sized from the documents.
    void InitData(Document&);
    // Debug helper: recompute theta from the token-topic assignments and
    // compare against the sparse representation; aborts on mismatch.
    void validTheta(Document&);
    // Rebuild the sparse theta on the GPU(s) from the current topic
    // assignments; `stream` optionally supplies one CUDA stream per chunk.
    void UpdateThetaGPU(Document &doc, cudaStream_t *stream=NULL);
    // Release all host and device allocations (safe to call repeatedly).
    void clearPtr();

    ~ModelTheta(){ clearPtr();}

    void toGPU();  // copy host-side index arrays to the devices
    void toCPU();  // copy device-side theta arrays back to the host

    //float countZero();
    //float countIA();

    //void saveTheta(string fileName);
    //void saveDoc(std::string docFileName);

};


#endif
-------------------------------------------------------------------------------- /src_culda/model/model_theta_chunk.cpp: --------------------------------------------------------------------------------
/* Implementations of class ModelThetaChunk */

#include "model_theta_chunk.h"

// Default constructor: zero every scalar and null every buffer pointer.
// The trailing clearPtr() is redundant here (everything is already NULL)
// but harmless — clearPtr() guards each free/delete against NULL.
ModelThetaChunk::ModelThetaChunk():
    k(0),
    numDocs(0),
    numWords(0),
    numChunks(0),
    chunkId(0),
    docIdStart(0),
    docIdEnd(0),
    chunkNumDocs(0),
    chunkNNZ(0),
    deviceThetaA(NULL),
    deviceThetaJA(NULL),
    deviceThetaCurIA(NULL),
    deviceThetaMaxIA(NULL),
    deviceDenseTheta(NULL),
    hostThetaA(NULL),
    hostThetaJA(NULL),
    hostThetaCurIA(NULL),
    hostThetaMaxIA(NULL)
{
    clearPtr();
}

// Geometry-only constructor: records the shard shape (topic count, the
// [docIdStart, docIdEnd) document range owned by this chunk, and the sparse
// capacity chunkNNZ).  Buffers are allocated later by InitData().
ModelThetaChunk::ModelThetaChunk(
        int argK,
        int argNumDocs,
        int argNumWords,
        int argNumChunks,
        int argChunkId,
        int argDocIdStart,
        int argDocIdEnd,
        int argChunkNumDocs,
        int argChunkNNZ):
    k(argK),
    numDocs(argNumDocs),
    numWords(argNumWords),
    numChunks(argNumChunks),
    chunkId(argChunkId),
    docIdStart(argDocIdStart),
    docIdEnd(argDocIdEnd),
    chunkNumDocs(argChunkNumDocs),
    chunkNNZ(argChunkNNZ),
    deviceThetaA(NULL),
    deviceThetaJA(NULL),
    deviceThetaCurIA(NULL),
    deviceThetaMaxIA(NULL),
    deviceDenseTheta(NULL),
    hostThetaA(NULL),
    hostThetaJA(NULL),
    hostThetaCurIA(NULL),
    hostThetaMaxIA(NULL)
{
    clearPtr();
}



// Allocate the CSR-like sparse theta for this chunk and build the row
// boundaries (MaxIA) from the per-document token counts:
//   *ThetaA    : per-entry counts (short)
//   *ThetaJA   : per-entry topic id (short)
//   *ThetaMaxIA: per-document row start/end offsets (capacity boundaries)
//   *ThetaCurIA: per-document current fill level (device side)
// NOTE(review): the parameter's template argument was stripped by extraction;
// usage implies vector<int> (docLenVec[docId] is a length).
void ModelThetaChunk::InitData(const vector &docLenVec)
{
    //alloc space
    hostThetaA = new short[chunkNNZ];
    hostThetaJA = new short[chunkNNZ];
    hostThetaCurIA = new int[numDocs];
    hostThetaMaxIA = new int[numDocs + 1];

    //CPU side
    memset(hostThetaMaxIA, 0, sizeof(int)*(numDocs + 1));

    // Rows before docIdStart stay 0; rows in [docIdStart, docIdEnd) get a
    // capacity equal to the document length; rows at/after docIdEnd are
    // pinned to the final offset so out-of-chunk lookups see empty rows.
    int offset = 0;
    for(int docId = docIdStart; docId < docIdEnd; docId ++){
        hostThetaMaxIA[docId] = offset;
        hostThetaMaxIA[docId + 1] = offset + docLenVec[docId];
        offset += docLenVec[docId];
    }
    for(int docId = docIdEnd; docId <= numDocs ;docId ++)
        hostThetaMaxIA[docId] = offset;

    // One chunk is bound to one device: chunk id doubles as CUDA device id.
    cudaSetDevice(chunkId);
    //GPU side
    cudaMalloc((void**)&deviceThetaA, sizeof(short)*chunkNNZ);
    cudaMalloc((void**)&deviceThetaJA, sizeof(short)*chunkNNZ);
    cudaMalloc((void**)&deviceThetaMaxIA, sizeof(int)*(numDocs + 1));
    cudaMalloc((void**)&deviceThetaCurIA, sizeof(int)*numDocs);
    // Dense scratch: one k-wide row per update worker (not per document).
    cudaMalloc((void**)&deviceDenseTheta, sizeof(int)*UpdateNumWorkers*k);

    long long totalByte = sizeof(short)*chunkNNZ +
                          sizeof(short)*chunkNNZ +
                          sizeof(int)*(numDocs + 1) +
                          sizeof(int)*numDocs +
                          sizeof(int)*UpdateNumWorkers*k;
    printf("theta chunk size:%.3f GB\n",totalByte/(1024.0*1024.0*1024.0));

    //exit(0);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //transfer MaxIA
    toGPU();
}

// Rebuild this chunk's sparse theta on the GPU from the current token-topic
// assignments.  Clears A/JA first, then launches the update kernel on
// `stream`.  The repeated cudaDeviceSynchronize()+gpuErr pairs are
// debug-style error fences (they also serialize the async memsets).
void ModelThetaChunk::UpdateThetaGPU(Document &doc, cudaStream_t stream)
{

    cudaSetDevice(chunkId);

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    cudaMemsetAsync(deviceThetaA, 0, sizeof(short)*chunkNNZ, stream);
    cudaMemsetAsync(deviceThetaJA, 0, sizeof(short)*chunkNNZ, stream);
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    LDAUpdateThetaAPI(
        k,
        numDocs,
        chunkNumDocs,
        docIdStart,
        docIdEnd,
        doc.docChunkVec[chunkId]->deviceWordIndices,
        doc.docChunkVec[chunkId]->deviceWordTokens,
        doc.docChunkVec[chunkId]->deviceWordTopics,
        doc.docChunkVec[chunkId]->deviceDocRevIndices,
        doc.docChunkVec[chunkId]->deviceDocRevIdx,
        deviceThetaA,
        deviceThetaCurIA,
        deviceThetaMaxIA,
        deviceThetaJA,
        deviceDenseTheta,
        stream
    );

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());
}

// Host-side consistency check of the sparse theta (call toCPU() first).
// Pass 1: each row must be 32-aligned and contain at most one entry with
//         JA==0 but A!=0 (topic 0 is indistinguishable from padding).
// Pass 2: per document, recompute the dense topic histogram from the token
//         assignments and compare entry-by-entry.  Aborts on any mismatch.
void ModelThetaChunk::validTheta(Document &doc)
{

    printf("Calling ModelThetaChunk::validTheta() for chunk %d ...\n", chunkId);
    clock_t clockStart = clock();


    printf("theta zero check\n");
    for(int docId = docIdStart; docId < docIdEnd; docId ++)
    {

        int foundFlag = 0, errorFlag = 0;
        long long tmpStart = hostThetaMaxIA[docId];
        long long tmpEnd = hostThetaMaxIA[docId + 1];

        // Rows are expected to be padded to multiples of 32 (warp width).
        if(tmpStart%32 != 0){
            printf("tmpStart non-aligned error\n");
            exit(0);

        }
        if(tmpEnd%32 != 0){
            printf("tmpEnd non-aligned error\n");
            exit(0);
        }

        for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){

            if(hostThetaJA[tmpIdx] == 0 && hostThetaA[tmpIdx] != 0)
            {
                if(foundFlag == 1)errorFlag = 1;
                else foundFlag = 1;
            }
        }

        if(errorFlag == 1)
        {
            printf("error in validTheta\n");

            for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){
                printf("IA(%lld), JA(%d), A(%d)\n", tmpIdx, hostThetaJA[tmpIdx], hostThetaA[tmpIdx]);
            }
            exit(0);
        }
    }


    //doc by doc check
    int *tmpThetaArray = new int[k]();
    int *tmpMask = new int[k]();
    for(int docId = docIdStart;docId < docIdEnd; docId ++){

        //generate the dense array
        for(int i = 0;i < k;i++){
            tmpThetaArray[i] = 0;
            tmpMask[i] = 0;
        }

        // Walk every token of this document via the reverse index and count
        // topic occurrences — this is the ground truth for the sparse row.
        for(long long revIdx = doc.docChunkVec[chunkId]->docRevIndices[docId];
                revIdx < doc.docChunkVec[chunkId]->docRevIndices[docId + 1];
                revIdx ++){

            int tokenIdx = doc.docChunkVec[chunkId]->docRevIdx[revIdx];
            int tmpTopic = doc.docChunkVec[chunkId]->wordTopics[tokenIdx];
            tmpThetaArray[tmpTopic] ++;
        }

        long long tmpStart = hostThetaMaxIA[docId];
        long long tmpEnd = hostThetaMaxIA[docId + 1];

        //round 1, check non-zero element
        for(long long tmpIdx = tmpStart; tmpIdx < tmpEnd; tmpIdx ++){
            int tmpK = hostThetaJA[tmpIdx];
            int tmpVal = hostThetaA[tmpIdx];

            if(tmpVal == 0)continue;

            tmpMask[tmpK] = 1;
            if(tmpThetaArray[tmpK] != tmpVal){
                printf("ValidTheta Error 1: docId(%d), topic(%d), tmpTheta(%d), theta(%d), IA(%lld)\n",
                    docId,
                    tmpK,
                    tmpThetaArray[tmpK],
                    tmpVal,
                    tmpIdx);

                printf("tmpTheta:\n");

                for(int j = 0;j < k/32;j ++){
                    printf("%2d:",j);
                    for(int m = 0;m < 32;m ++){
                        printf("%d,",tmpThetaArray[j*32 + m]);
                    }
                    printf("\n");
                }
                exit(0);
            }
        }

        //round2, check zero element
        for(int tmpK = 0;tmpK < k; tmpK ++){
            if(tmpMask[tmpK] == 1)continue;

            if(tmpThetaArray[tmpK] != 0){
                printf("ValidTheta Error 2: docId(%d), topic(%d), val(%d)\n",
                    docId, tmpK, tmpThetaArray[tmpK]);

                printf("docId:(%d)\n", docId);
                for(int idx = hostThetaMaxIA[docId]; idx < hostThetaMaxIA[docId +1]; idx ++){
                    printf("IA(%d), JA(%d), A(%d)\n", idx, hostThetaJA[idx], hostThetaA[idx]);

                    if(hostThetaJA[idx] == 0 && hostThetaA[idx] == 0)break;
                }

                printf("tmpTheta:\n");
                for(int j = 0;j < k/32;j ++){
                    printf("%2d:",j);
                    for(int m = 0;m < 32;m ++){
                        printf("%d,",tmpThetaArray[j*32 + m]);
                    }
                    printf("\n");
                }
                exit(0);
            }
        }
    }

    printf("Validate Theta passed ...\n");
    printf("ValidateTheta time:%.2fs\n", (clock() - clockStart)/(double)CLOCKS_PER_SEC);

    delete []tmpThetaArray;
    delete []tmpMask;
}

// Push the (host-built) row boundaries to the device.  A/JA/CurIA are
// regenerated on the GPU by UpdateThetaGPU(), so only MaxIA is copied.
void ModelThetaChunk::toGPU()
{
    //printf("ModelThetaChunk(%d)::toGPU() ...\n", chunkId);

    cudaMemcpy(deviceThetaMaxIA,
               hostThetaMaxIA,
               sizeof(int)*(numDocs + 1),
               cudaMemcpyHostToDevice);
    //gpuErr(cudaPeekAtLastError());

    //printf("ModelThetaChunk::toGPU() finished ...\n\n");
}

// Pull every theta array back to the host (for validation / inspection).
void ModelThetaChunk::toCPU()
{
    //printf("ModelThetaChunk::thetaToCPU() ...\n");
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    //theta
    cudaMemcpy(hostThetaA,
               deviceThetaA,
               sizeof(short)*chunkNNZ,
               cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());

    cudaMemcpy(hostThetaJA,
               deviceThetaJA,
               sizeof(short)*chunkNNZ,
               cudaMemcpyDeviceToHost);


    cudaMemcpy(hostThetaMaxIA,
               deviceThetaMaxIA,
               sizeof(int)*(numDocs + 1),
               cudaMemcpyDeviceToHost);
    gpuErr(cudaPeekAtLastError());

    cudaMemcpy(hostThetaCurIA,
               deviceThetaCurIA,
               sizeof(int)*numDocs,
               cudaMemcpyDeviceToHost);
    gpuErr(cudaPeekAtLastError());

    cudaDeviceSynchronize();
    gpuErr(cudaPeekAtLastError());
    //printf("ModelThetaChunk::thetaToCPU() finished ...\n");

}

// Free all device and host buffers and reset the pointers to NULL.
// Idempotent: safe to call from the constructors and the destructor path.
void ModelThetaChunk::clearPtr()
{
    if(deviceThetaA != NULL) cudaFree(deviceThetaA);
    if(deviceThetaJA != NULL) cudaFree(deviceThetaJA);
    if(deviceThetaMaxIA != NULL) cudaFree(deviceThetaMaxIA);
    if(deviceThetaCurIA != NULL) cudaFree(deviceThetaCurIA);
    if(deviceDenseTheta != NULL) cudaFree(deviceDenseTheta);

    //CPU data release
    if(hostThetaA != NULL) delete []hostThetaA;
    if(hostThetaJA != NULL) delete []hostThetaJA;
    if(hostThetaMaxIA != NULL) delete []hostThetaMaxIA;
    if(hostThetaCurIA != NULL) delete []hostThetaCurIA;

    deviceThetaA = NULL;
    deviceThetaJA = NULL;
    deviceThetaMaxIA = NULL;
    deviceThetaCurIA = NULL;
    deviceDenseTheta = NULL;

    hostThetaA = NULL;
    hostThetaJA = NULL;
    hostThetaMaxIA = NULL;
    hostThetaCurIA = NULL;
}


-------------------------------------------------------------------------------- /src_culda/model/model_theta_chunk.h: --------------------------------------------------------------------------------
#ifndef _GPU_THETA_CHUNK_H_

#define _GPU_THETA_CHUNK_H_

// NOTE(review): system-header names stripped by extraction — restore before
// compiling.
#include
#include
#include
#include

#include
#include "culda_argument.h"
#include "vocab.h"
#include "doc.h"

#include "../kernel/lda_train_kernel.h"
#include "../kernel/lda_theta_kernel.h"

using namespace std;

// One GPU-resident shard of the document-topic matrix; see the .cpp above.
class ModelThetaChunk
{
public:
    int k;
    int numDocs;
    int
numWords; 26 | int numChunks; 27 | 28 | int chunkId; 29 | int docIdStart; 30 | int docIdEnd; 31 | int chunkNumDocs; 32 | 33 | int chunkNNZ; 34 | 35 | //GPU data 36 | short *deviceThetaA; //chunkNNZ 37 | short *deviceThetaJA; //chunkNNZ 38 | int *deviceThetaCurIA; //numDocs 39 | int *deviceThetaMaxIA; //numDocs + 1 40 | int *deviceDenseTheta; //chunkNumDocs + 1 41 | 42 | //CPU data 43 | short *hostThetaA; //chunkNNZ 44 | short *hostThetaJA; //chunkNNZ 45 | int *hostThetaCurIA; //numDocs 46 | int *hostThetaMaxIA; //numDocs + 1 47 | 48 | ModelThetaChunk(); 49 | ModelThetaChunk(int, int, int, int, int, int, int, int, int); 50 | 51 | 52 | void InitData(const vector &); 53 | void toGPU(); 54 | void toCPU(); 55 | void UpdateThetaGPU(Document &, cudaStream_t stream=0); 56 | void validTheta(Document&); 57 | 58 | void clearPtr(); 59 | }; 60 | 61 | #endif -------------------------------------------------------------------------------- /src_culda/model/vocab.cpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | #include "vocab.h" 12 | 13 | 14 | //Definitions of methods of class Voculary 15 | void Vocabulary::clear() 16 | { 17 | wordList.clear(); 18 | wordSet.clear(); 19 | } 20 | 21 | bool Vocabulary::hasWord(const string & token) 22 | { 23 | return wordSet.find(Word(token,0)) != wordSet.end(); 24 | } 25 | 26 | void Vocabulary::insertWord(const string & token) 27 | { 28 | if(!hasWord(token)){ 29 | Word tmpWord(token, wordList.size()); 30 | wordList.push_back(tmpWord); 31 | wordSet.insert(tmpWord); 32 | } 33 | } 34 | 35 | string Vocabulary::getWordById(int argId) 36 | { 37 | 38 | if(argId >= wordList.size())cout << "overflow:" << argId << endl; 39 | return wordList[argId].token; 40 | } 41 | 42 | int Vocabulary::getIdByWord(string argToken) 43 | { 44 | set::iterator setIte = wordSet.find(argToken); 45 | if(setIte == wordSet.end())return -1; 46 | else 
return setIte->id; 47 | } 48 | 49 | void Vocabulary::loadVocab(string fileName) 50 | { 51 | 52 | clear(); 53 | ifstream inputFile(fileName.c_str(), ios::in); 54 | 55 | if(!inputFile.is_open()){ 56 | cout << "Vocabulary file " << fileName << " open failed" << endl; 57 | exit(0); 58 | } 59 | 60 | string token; 61 | int id; 62 | while(inputFile >> token >> id){ 63 | insertWord(token); 64 | } 65 | } 66 | 67 | void Vocabulary::writeVocab(string fileName) 68 | { 69 | 70 | ofstream outputFile(fileName.c_str(), ios::out); 71 | 72 | if(!outputFile.is_open()){ 73 | cout << "Vocabulary file " << fileName << " open failed" << endl; 74 | exit(0); 75 | } 76 | 77 | for(int i = 0;i < wordList.size(); i++) 78 | outputFile << wordList[i].token << " " << wordList[i].id << endl; 79 | 80 | } 81 | 82 | void Vocabulary::printVocabAbbr() 83 | { 84 | printf("----vocab info-----\n"); 85 | printf("numWords:%d\n", wordList.size()); 86 | 87 | } 88 | 89 | void Vocabulary::printVocabFull() 90 | { 91 | 92 | for(int i = 0;i < wordList.size();i++) 93 | cout << "(" << wordList[i].token << "," << wordList[i].id << ")" << endl; 94 | 95 | } 96 | 97 | Vocabulary::Vocabulary(const string &fname) 98 | { 99 | loadVocab(fname); 100 | } 101 | 102 | 103 | -------------------------------------------------------------------------------- /src_culda/model/vocab.h: -------------------------------------------------------------------------------- 1 | #ifndef _VOCAB_H_ 2 | 3 | #define _VOCAB_H_ 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | class Word{ 13 | public: 14 | string token; 15 | int id; 16 | bool operator < (const Word & arg) const{ 17 | return token.compare(arg.token) < 0; 18 | } 19 | bool operator > (const Word & arg) const{ 20 | return token.compare(arg.token) > 0; 21 | } 22 | bool operator == (const Word & arg) const{ 23 | return token.compare(arg.token) == 0; 24 | } 25 | 26 | Word(){} 27 | Word(string argString){ token = argString;} 28 | Word(string 
argString, int argId){ token = argString; id = argId;} 29 | }; 30 | 31 | 32 | class Vocabulary 33 | { 34 | public: 35 | std::vector wordList; 36 | std::set wordSet; 37 | 38 | Vocabulary(){} 39 | Vocabulary(const string &fname); 40 | 41 | void clear(); 42 | bool hasWord(const string & token); 43 | void insertWord(const string & token); 44 | 45 | string getWordById(int argId); 46 | int getIdByWord(string argToken); 47 | 48 | void loadVocab(string fname); 49 | void writeVocab(string fname); 50 | 51 | void printVocabAbbr(); 52 | void printVocabFull(); 53 | }; 54 | 55 | 56 | #endif -------------------------------------------------------------------------------- /src_culda/train/MultiChunkMultiGPUequal.h: -------------------------------------------------------------------------------- 1 | #ifndef _MultiChunkMultiGPUequal_H_ 2 | #define _MultiChunkMultiGPUequal_H_ 3 | 4 | 5 | void static MultiChunkMultiGPUequal(Document &doc, Vocabulary &vocab, Argument &arg, 6 | ModelPhi &modelPhi, ModelTheta &modelTheta) 7 | { 8 | 9 | /* data preparation and transfer */ 10 | gpuErr(cudaPeekAtLastError()); 11 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){ 12 | doc.docChunkVec[chunkId]->allocGPU(chunkId); 13 | doc.docChunkVec[chunkId]->toGPU(); 14 | } 15 | printf("\n"); 16 | 17 | /* modelphi */ 18 | modelPhi.InitData(doc); 19 | printf("doc.numChunks:%d\n", doc.numChunks); 20 | for(int i = 0; i < doc.numChunks; i++) 21 | modelPhi.UpdatePhiGPU(doc, i); 22 | 23 | cudaDeviceSynchronize(); 24 | gpuErr(cudaPeekAtLastError()); 25 | 26 | for(int i = 1;i < arg.numGPUs; i++){ 27 | modelPhi.MasterGPUCollect(i); 28 | modelPhi.MasterGPUReduce(); 29 | } 30 | for(int i = 1;i < arg.numGPUs; i++) 31 | modelPhi.MasterGPUDistribute(i); 32 | 33 | modelPhi.MasterGPUToCPU(); 34 | modelPhi.validPhi(doc); 35 | modelPhi.UpdatePhiHead(arg.beta); 36 | 37 | cudaDeviceSynchronize(); 38 | gpuErr(cudaPeekAtLastError()); 39 | 40 | /* model theta */ 41 | modelTheta.InitData(doc); //alloc GPU+CPU memory 
space. 42 | modelTheta.UpdateThetaGPU(doc); 43 | modelTheta.toCPU(); 44 | //modelTheta.validTheta(doc); 45 | 46 | cudaDeviceSynchronize(); 47 | gpuErr(cudaPeekAtLastError()); 48 | 49 | 50 | /* prepare the randstate, used for random sampling. */ 51 | 52 | 53 | int randStateSize = 256; 54 | curandState *deviceRandState[MaxNumGPU]; 55 | for(int i = 0;i < arg.numGPUs;i++){ 56 | cudaSetDevice(i); 57 | cudaMalloc(&deviceRandState[i], sizeof(curandState)*randStateSize); 58 | initRandState<<>>(deviceRandState[i]); 59 | } 60 | 61 | cudaStream_t mainStream[MaxNumGPU]; 62 | cudaStream_t branStream[MaxNumGPU]; 63 | pthreadArgTheta thetaArgs[MaxNumGPU]; 64 | pthread_t threads[MaxNumGPU]; 65 | for(int i = 0;i < arg.numGPUs;i++){ 66 | cudaSetDevice(i); 67 | cudaStreamCreate(&mainStream[i]); 68 | cudaStreamCreate(&branStream[i]); 69 | 70 | thetaArgs[i].mainStream = mainStream[i]; 71 | thetaArgs[i].branStream = branStream[i]; 72 | thetaArgs[i].thetaPtr = &modelTheta; 73 | thetaArgs[i].id = i; 74 | thetaArgs[i].docPtr = &doc; 75 | } 76 | 77 | 78 | 79 | cudaDeviceSynchronize(); 80 | gpuErr(cudaPeekAtLastError()); 81 | 82 | //launch kernels 83 | 84 | struct timespec begin, end; 85 | double elapsed = 0, stamp = 0; 86 | 87 | printf("Launching Sampling Part ...\n"); 88 | for(int ite = 0;ite < arg.iteration; ite++){ 89 | 90 | //printf("Iteration %3d:", ite + 1); 91 | 92 | clock_gettime(CLOCK_MONOTONIC, &begin); 93 | 94 | 95 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++){ 96 | 97 | cudaSetDevice(chunkId); 98 | 99 | cudaDeviceSynchronize(); 100 | gpuErr(cudaPeekAtLastError()); 101 | 102 | //LDAKernelTrain<<>>( 103 | LDAKernelTrain<<numSlots, TrainBlockSize, 0, mainStream[chunkId]>>>( 104 | arg.k, 105 | arg.alpha, 106 | arg.beta, 107 | doc.numDocs, 108 | doc.numWords, 109 | doc.docChunkVec[chunkId]->chunkNumTokens, 110 | doc.docChunkVec[chunkId]->deviceWordIndices, 111 | doc.docChunkVec[chunkId]->deviceSlotIdToWordId, 112 | doc.docChunkVec[chunkId]->deviceSlotIndices, 113 | 
doc.docChunkVec[chunkId]->deviceWordTokens, 114 | doc.docChunkVec[chunkId]->deviceWordTopics, 115 | modelTheta.thetaChunkVec[chunkId]->deviceThetaA, 116 | modelTheta.thetaChunkVec[chunkId]->deviceThetaMaxIA, 117 | modelTheta.thetaChunkVec[chunkId]->deviceThetaCurIA, 118 | modelTheta.thetaChunkVec[chunkId]->deviceThetaJA, 119 | modelTheta.thetaChunkVec[chunkId]->docIdStart, 120 | modelPhi.phiChunkVec[chunkId]->devicePhiTopicWordShort, 121 | modelPhi.phiChunkVec[chunkId]->devicePhiTopic, 122 | modelPhi.phiChunkVec[chunkId]->devicePhiHead, 123 | deviceRandState[chunkId], 124 | randStateSize, 125 | chunkId, 126 | doc.docChunkVec[chunkId]->deviceWordPerplexity, 127 | doc.docChunkVec[chunkId]->deviceDocRevIndices 128 | ); 129 | } 130 | for(int i = 0;i < arg.numGPUs; i++) 131 | modelPhi.UpdatePhiGPU(doc, i, mainStream[i]); 132 | 133 | 134 | double logLike = LDATrainPerplexity(doc, mainStream); 135 | //printf("log likelyhood :%.8f\n", logLike); 136 | 137 | for(int i = 0;i < arg.numGPUs;i++){ 138 | pthread_create(&(threads[i]), 139 | NULL, 140 | UpdateThetaThread, 141 | (void*)(&(thetaArgs[i]))); 142 | //pthread_join(threads[i], NULL); 143 | } 144 | 145 | 146 | for(int i = 1;i < arg.numGPUs; i++){ 147 | cudaStreamSynchronize(mainStream[i]); 148 | modelPhi.MasterGPUCollect(i, mainStream[0]); 149 | modelPhi.MasterGPUReduce(mainStream[0]); 150 | } 151 | 152 | cudaStreamSynchronize(mainStream[0]); 153 | 154 | for(int i = 1;i < arg.numGPUs; i++) 155 | modelPhi.MasterGPUDistribute(i, mainStream[i]); 156 | 157 | modelPhi.UpdatePhiHead(arg.beta, mainStream); 158 | 159 | for(int i = 0;i < arg.numGPUs;i++) 160 | pthread_join(threads[i], NULL); 161 | 162 | cudaDeviceSynchronize(); 163 | 164 | clock_gettime(CLOCK_MONOTONIC, &end); 165 | stamp = end.tv_sec - begin.tv_sec; 166 | stamp += (end.tv_nsec - begin.tv_nsec) / 1000000000.0; 167 | elapsed += stamp; 168 | 169 | printf("Iteration, %d,%.3f sec,%.3f sec, %.8f, %.3f M\n", ite+1,elapsed, stamp, logLike, doc.numTokens/stamp/1000000); 
170 | 171 | if((ite + 1)%20 == 0) sleep(60); 172 | } 173 | 174 | /* 175 | cudaDeviceSynchronize(); 176 | gpuErr(cudaPeekAtLastError()); 177 | 178 | 179 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++) 180 | doc.docChunkVec[chunkId]->toCPU(); 181 | printf("\n"); 182 | modelTheta.toCPU(); 183 | modelTheta.validTheta(doc); 184 | 185 | //modelPhi.MasterGPUToCPU(); 186 | //modelPhi.validPhi(doc); 187 | 188 | for(int i = 0;i < arg.numChunks; i++)cudaFree(deviceRandState[i]); 189 | cudaDeviceSynchronize(); 190 | gpuErr(cudaPeekAtLastError()); 191 | */ 192 | } 193 | 194 | #endif 195 | -------------------------------------------------------------------------------- /src_culda/train/SingleChunkSingleGPU.h: -------------------------------------------------------------------------------- 1 | #ifndef _SingleChunkSingleGPU_H_ 2 | #define _SingleChunkSingleGPU_H_ 3 | 4 | 5 | void static SingleChunkSingleGPU(Document &doc, Vocabulary &vocab, Argument &arg, 6 | ModelPhi &modelPhi, ModelTheta &modelTheta) 7 | { 8 | 9 | /* data preparation and transfer */ 10 | 11 | printf("Call SingleChunkSingleGPU() ...\n"); 12 | 13 | 14 | printf("alloc gpu for doc ...\n"); 15 | doc.docChunkVec[0]->allocGPU(0); 16 | printf("to gpu for doc ...\n"); 17 | doc.docChunkVec[0]->toGPU(); 18 | 19 | cudaDeviceSynchronize(); 20 | gpuErr(cudaPeekAtLastError()); 21 | 22 | /* model phi */ 23 | printf("Prepare model phi ...\n"); 24 | modelPhi.InitData(doc); 25 | modelPhi.UpdatePhiGPU(doc, 0); 26 | modelPhi.UpdatePhiHead(arg.beta); 27 | //modelPhi.MasterGPUToCPU(); 28 | //modelPhi.validPhi(doc); 29 | 30 | 31 | /* model theta */ 32 | printf("Prepare model theta ...\n"); 33 | cudaDeviceSynchronize(); 34 | gpuErr(cudaPeekAtLastError()); 35 | 36 | modelTheta.InitData(doc); 37 | 38 | cudaDeviceSynchronize(); 39 | gpuErr(cudaPeekAtLastError()); 40 | 41 | modelTheta.UpdateThetaGPU(doc); 42 | 43 | //exit(0); 44 | 45 | cudaDeviceSynchronize(); 46 | gpuErr(cudaPeekAtLastError()); 47 | 48 | //modelTheta.toCPU(); 
49 | //modelTheta.validTheta(doc); 50 | 51 | //exit(0); 52 | 53 | 54 | /* prepare the randstate */ 55 | int randStateSize = 256*20; 56 | curandState *deviceRandState[MaxNumGPU]; 57 | cudaMalloc(&deviceRandState[0], sizeof(curandState)*randStateSize); 58 | initRandState<<>>(deviceRandState[0]); 59 | 60 | cudaStream_t extraStream; 61 | cudaStreamCreate(&extraStream); 62 | 63 | cudaDeviceSynchronize(); 64 | gpuErr(cudaPeekAtLastError()); 65 | 66 | struct timespec begin, end; 67 | double elapsed = 0, stamp = 0; 68 | 69 | //launch train kernels 70 | for(int ite = 0;ite < arg.iteration; ite++) 71 | { 72 | clock_gettime(CLOCK_MONOTONIC, &begin); 73 | 74 | //numBlocks = 100; 75 | LDAKernelTrain<<numSlots, TrainBlockSize>>>( 76 | arg.k, 77 | arg.alpha, 78 | arg.beta, 79 | doc.numDocs, 80 | doc.numWords, 81 | doc.docChunkVec[0]->chunkNumTokens, 82 | doc.docChunkVec[0]->deviceWordIndices, 83 | doc.docChunkVec[0]->deviceSlotIdToWordId, 84 | doc.docChunkVec[0]->deviceSlotIndices, 85 | doc.docChunkVec[0]->deviceWordTokens, 86 | doc.docChunkVec[0]->deviceWordTopics, 87 | modelTheta.thetaChunkVec[0]->deviceThetaA, 88 | modelTheta.thetaChunkVec[0]->deviceThetaMaxIA, 89 | modelTheta.thetaChunkVec[0]->deviceThetaCurIA, 90 | modelTheta.thetaChunkVec[0]->deviceThetaJA, 91 | modelTheta.thetaChunkVec[0]->docIdStart, 92 | modelPhi.phiChunkVec[0]->devicePhiTopicWordShort, 93 | modelPhi.phiChunkVec[0]->devicePhiTopic, 94 | modelPhi.phiChunkVec[0]->devicePhiHead, 95 | deviceRandState[0], 96 | randStateSize, //arg.numWorkers, 97 | 0, 98 | doc.docChunkVec[0]->deviceWordPerplexity, 99 | doc.docChunkVec[0]->deviceDocRevIndices 100 | ); 101 | 102 | //cudaDeviceSynchronize(); 103 | //gpuErr(cudaPeekAtLastError()); 104 | 105 | double logLike = LDATrainPerplexity(doc); 106 | //cudaDeviceSynchronize(); 107 | //gpuErr(cudaPeekAtLastError()); 108 | 109 | //doc.docChunkVec[0]->toCPU(); 110 | 111 | modelPhi.UpdatePhiGPU(doc, 0); 112 | modelPhi.UpdatePhiHead(arg.beta); 113 | //modelPhi.MasterGPUToCPU(); 
114 | //modelPhi.validPhi(doc); 115 | 116 | modelTheta.UpdateThetaGPU(doc); 117 | //modelTheta.toCPU(); 118 | //modelTheta.validTheta(doc); 119 | 120 | cudaDeviceSynchronize(); 121 | 122 | clock_gettime(CLOCK_MONOTONIC, &end); 123 | stamp = end.tv_sec - begin.tv_sec; 124 | stamp += (end.tv_nsec - begin.tv_nsec) / 1000000000.0; 125 | elapsed += stamp; 126 | 127 | printf("Iteration %3d: %6.2f sec, %3.2f sec, logLikelyhood = %.8f, %5.3f M\n", ite+1,elapsed, stamp, logLike, doc.numTokens/stamp/1000000); 128 | cudaDeviceSynchronize(); 129 | gpuErr(cudaPeekAtLastError()); 130 | 131 | // if((ite + 1)%30 == 0)sleep(120); 132 | 133 | } 134 | 135 | cudaDeviceSynchronize(); 136 | gpuErr(cudaPeekAtLastError()); 137 | 138 | for(int chunkId = 0; chunkId < doc.numChunks; chunkId ++) 139 | doc.docChunkVec[chunkId]->toCPU(); 140 | printf("\n"); 141 | 142 | cudaDeviceSynchronize(); 143 | gpuErr(cudaPeekAtLastError()); 144 | 145 | modelTheta.toCPU(); 146 | //modelTheta.validTheta(doc); 147 | 148 | modelPhi.MasterGPUToCPU(); 149 | //cudaDeviceSynchronize(); 150 | //modelPhi.validPhi(doc); 151 | 152 | 153 | 154 | cudaDeviceSynchronize(); 155 | gpuErr(cudaPeekAtLastError()); 156 | //modelPhi.savePhi("phi.data"); 157 | } 158 | 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /src_culda/train/lda_train.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | #include "../model/vocab.h" 17 | #include "../model/doc.h" 18 | #include "lda_train.h" 19 | #include "../kernel/lda_train_kernel.h" 20 | 21 | 22 | struct pthreadArgTheta 23 | { 24 | int id; 25 | Document *docPtr; 26 | ModelTheta *thetaPtr; 27 | cudaStream_t mainStream; 28 | cudaStream_t branStream; 29 | }; 30 | 31 | static void *UpdateThetaThread(void *arg) 32 | { 33 
| pthreadArgTheta *localArg = (pthreadArgTheta*)arg; 34 | cudaStreamSynchronize(localArg->mainStream); 35 | localArg->thetaPtr->thetaChunkVec[localArg->id]->UpdateThetaGPU(*(localArg->docPtr), localArg->branStream); 36 | 37 | return NULL; 38 | } 39 | 40 | #include "SingleChunkSingleGPU.h" 41 | #include "MultiChunkMultiGPUequal.h" 42 | //#include "MultiChunkMultiGPUNotequal.h" 43 | 44 | void LDATrain(Document &doc, Vocabulary &vocab, Argument &arg) 45 | { 46 | if(arg.numGPUs > doc.numChunks) arg.numGPUs = doc.numChunks; 47 | 48 | //ModelPhi preparation. 49 | ModelPhi modelPhi(arg.k, arg.numGPUs, doc.numDocs, doc.numWords, doc.numChunks); 50 | ModelTheta modelTheta(arg.k, doc.numDocs, doc.numWords, doc.numChunks); 51 | 52 | if(doc.numChunks == 1) //One chunk, one GPU. 53 | SingleChunkSingleGPU(doc, vocab, arg, modelPhi, modelTheta); 54 | else if(doc.numChunks != 1 && arg.numGPUs != 1 && arg.numGPUs == doc.numChunks) 55 | MultiChunkMultiGPUequal(doc, vocab, arg, modelPhi, modelTheta); 56 | 57 | } 58 | -------------------------------------------------------------------------------- /src_culda/train/lda_train.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef _LDA_TRAIN_H_ 3 | #define _LDA_TRAIN_H_ 4 | 5 | 6 | #include "../model/model_theta.h" 7 | #include "../model/model_phi.h" 8 | #include "../model/culda_argument.h" 9 | #include "../model/vocab.h" 10 | #include "../model/doc.h" 11 | 12 | #include "../kernel/lda_train_kernel.h" 13 | 14 | void LDATrain(Document &doc, Vocabulary &vocab, Argument &argu); 15 | 16 | double LDALikelihood(Argument&, Document&, ModelTheta&, ModelPhi&); 17 | 18 | #endif -------------------------------------------------------------------------------- /src_format/.gitignore: -------------------------------------------------------------------------------- 1 | 2.idx 2 | data_format.o 3 | format 4 | format.o 5 | train.* 6 | vocab.* 7 | 
-------------------------------------------------------------------------------- /src_format/Makefile: --------------------------------------------------------------------------------

# Builds the ./format preprocessing tool.
format: format.o vocab.o
	nvcc -o format format.o vocab.o

# Bug fix: vocab.cpp/vocab.h live under ../src_culda/model/, not directly
# under ../src_culda/ (format.cpp itself includes "../src_culda/model/vocab.h"),
# so the original prerequisites/recipe could never build.
vocab.o: ../src_culda/model/vocab.cpp ../src_culda/model/vocab.h
	nvcc -c ../src_culda/model/vocab.cpp -o vocab.o

format.o: format.cpp ../src_culda/model/vocab.h data_chunk.h
	nvcc -c format.cpp -o format.o

clean:
	rm -f *.o format
-------------------------------------------------------------------------------- /src_format/data_chunk.h: --------------------------------------------------------------------------------
#ifndef _DATA_CHUNK_H_
#define _DATA_CHUNK_H_

#include <sstream>  // reconstructed; extraction stripped the header name

// One output chunk: owns a contiguous document-id range and, per word, the
// list of owning document ids; serialized as a .word.idx/.word.data pair.
class DataChunk{

public:
    int numWords;
    int numChunks;
    int chunkId;

    int docIdStart;  // first doc id mapped to this chunk (inclusive)
    int docIdEnd;    // one past the last mapped doc id

    int chunkDocSize;          // number of documents in the chunk
    long long chunkTokenSize;  // number of tokens in the chunk

    string outFilePrefix;
    string outFileIdxName;
    string outFileDataName;

    // wordFirstVec[wordId] = document ids containing wordId, one entry per
    // occurrence.  (Template arguments reconstructed from usage; the
    // extraction stripped them.)
    vector< vector<int> > wordFirstVec;

    DataChunk(int argNumWords, int argNumChunks, int argChunkId, string argFilePrefix)
    {
        numWords = argNumWords;
        numChunks = argNumChunks;
        chunkId = argChunkId;
        outFilePrefix = argFilePrefix;

        for(int i = 0;i < numWords;i++)
            wordFirstVec.push_back(vector<int>());

        // The chunk id is appended to the file names only in multi-chunk
        // mode, matching what the CuLDA_CGS loader expects.
        stringstream tmpNameStream;
        outFileIdxName = outFilePrefix + ".word.idx";
        outFileDataName = outFilePrefix + ".word.data";
        if(numChunks > 1){
            tmpNameStream << outFileIdxName << chunkId;
            tmpNameStream >> outFileIdxName;
            tmpNameStream.clear();
            tmpNameStream << outFileDataName << chunkId;
            tmpNameStream >> outFileDataName;
        }

        docIdStart = 0;
        docIdEnd = 0;
    }

    // Write the per-word index (text) and the doc-id payload (binary).
    void writeChunk()
    {
        ofstream outFileIdxStream(outFileIdxName.c_str(), ios::out);
        ofstream outFileDataStream(outFileDataName.c_str(), ios::out|ios::binary);

long long offset = 0; 55 | for(int wordId = 0; wordId < wordFirstVec.size();wordId++){ 56 | offset += wordFirstVec[wordId].size(); 57 | outFileIdxStream << wordId << " " << offset << endl; 58 | 59 | for(int localTokenId = 0; localTokenId < wordFirstVec[wordId].size(); localTokenId ++){ 60 | int tmpWord = wordFirstVec[wordId][localTokenId]; 61 | outFileDataStream.write((char*)&(tmpWord), sizeof(int)); 62 | } 63 | } 64 | 65 | outFileIdxStream.close(); 66 | outFileDataStream.close(); 67 | } 68 | ~DataChunk() 69 | { 70 | } 71 | 72 | }; 73 | 74 | #endif -------------------------------------------------------------------------------- /src_format/format.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | //#include "../src_culda/data_format.h" 10 | #include "../src_culda/model/vocab.h" 11 | #include "../src_culda/model/culda_argument.h" 12 | 13 | 14 | #include "data_chunk.h" 15 | 16 | using namespace std; 17 | 18 | 19 | int main(int argc, char**argv) 20 | { 21 | 22 | int numChunks = 0; 23 | if(argc != 3 && argc != 4){ 24 | cout << "Usage:./format input output_prefix numChunks[default=1]" << endl; 25 | return 0; 26 | } 27 | if(argc == 4) numChunks = atoi(argv[3]); 28 | if(numChunks <= 1)numChunks = 1; 29 | 30 | //open file 31 | ifstream inputFile(argv[1], ios::in); 32 | 33 | if(!inputFile.is_open()){ 34 | cout << argv[1] << " open failed" << endl;; 35 | exit(0); 36 | } 37 | 38 | ofstream outFileVocab((argv[2] + string(".vocab")).c_str(), ios::out); 39 | ofstream outFileChunk((argv[2] + string(".chunk")).c_str(), ios::out); 40 | ofstream outFileDocIdx((argv[2] + string(".doc.idx")).c_str(), ios::out); 41 | 42 | if(!outFileVocab.is_open()){ 43 | cout << argv[2] << ".vocab open failed" << endl;; 44 | exit(0); 45 | } 46 | if(!outFileChunk.is_open()){ 47 | cout << argv[2] << ".chunk open failed" << endl;; 48 | exit(0); 49 | } 50 | 
if(!outFileDocIdx.is_open()){ 51 | cout << argv[2] << ".doc.idx open failed" << endl;; 52 | exit(0); 53 | } 54 | 55 | printf("reading input file ...\n"); 56 | //read input file 57 | Vocabulary vocab; 58 | long long numTokens = 0; 59 | vector > wordFirstVec; 60 | vector docLengthVec; 61 | int docId = 0; 62 | 63 | string docLine; 64 | while(getline(inputFile, docLine)){ 65 | 66 | string docTitle, token; 67 | 68 | std::istringstream docStream(docLine); 69 | if(!(docStream >> docTitle))continue; 70 | 71 | int docLength = 0; 72 | while(docStream >> token){ 73 | 74 | docLength ++; 75 | vocab.insertWord(token); 76 | int id = vocab.getIdByWord(token); 77 | numTokens ++; 78 | 79 | //insert it to wordFirstVec 80 | if(wordFirstVec.size() >= id){ 81 | 82 | int numLoops = id + 1 - wordFirstVec.size(); 83 | for(int i = 0;i < numLoops; i++) 84 | wordFirstVec.push_back(vector()); 85 | } 86 | wordFirstVec[id].push_back(docId); 87 | } 88 | outFileDocIdx << docId << " " << numTokens << endl; 89 | 90 | docLengthVec.push_back(docLength); 91 | docId ++; 92 | } 93 | inputFile.close(); 94 | 95 | printf("write vocabulary ...\n"); 96 | //write vocabulary 97 | for(int i = 0;i < vocab.wordList.size();i++){ 98 | outFileVocab << vocab.wordList[i].token << " " << vocab.wordList[i].id << endl; 99 | } 100 | outFileVocab.close(); 101 | 102 | //cout << "wordnumbers:" << vocab.word_list.size() << endl; 103 | //cout << "wordFirstVec:" << wordFirstVec.size() << endl; 104 | 105 | printf("mapping chunks ...\n"); 106 | //decide the doc -> chunk mapping 107 | long long tokenPerChunk = (numTokens + numChunks - 1)/numChunks; 108 | vector dataChunkVec; 109 | vector docToChunkVec; 110 | docId = 0; 111 | printf("numChunks:%lld\n", numChunks); 112 | printf("numTokens:%lld\n", numTokens); 113 | printf("perChunk :%lld\n", tokenPerChunk); 114 | for(int chunkId = 0; chunkId < numChunks; chunkId ++){ 115 | dataChunkVec.push_back(DataChunk(vocab.wordList.size(), numChunks, chunkId, argv[2])); 116 | 117 | long long 
tmpChunkSize = 0; 118 | dataChunkVec[chunkId].docIdStart = docId; 119 | while(docId < docLengthVec.size()){ 120 | tmpChunkSize += docLengthVec[docId]; 121 | docToChunkVec.push_back(chunkId); 122 | docId ++; 123 | 124 | if(tmpChunkSize >= tokenPerChunk)break; 125 | } 126 | dataChunkVec[chunkId].docIdEnd = docId; 127 | dataChunkVec[chunkId].chunkTokenSize = tmpChunkSize; 128 | dataChunkVec[chunkId].chunkDocSize = 129 | dataChunkVec[chunkId].docIdEnd - dataChunkVec[chunkId].docIdStart; 130 | 131 | outFileChunk << chunkId << " " << docId << endl; 132 | } 133 | 134 | for(int chunkId = 0;chunkId < numChunks; chunkId ++){ 135 | printf("----\n"); 136 | printf("chunkId:%d\n", chunkId); 137 | printf("numWords:%d\n", dataChunkVec[chunkId].numWords); 138 | printf("doc range:%d - %d\n", dataChunkVec[chunkId].docIdStart, dataChunkVec[chunkId].docIdEnd); 139 | printf("chunkSize:%lld\n", dataChunkVec[chunkId].chunkTokenSize); 140 | printf("%s\n", dataChunkVec[chunkId].outFileIdxName.c_str()); 141 | printf("%s\n", dataChunkVec[chunkId].outFileDataName.c_str()); 142 | } 143 | 144 | printf("chunk partitioning ...\n"); 145 | //distribute the data to each chunk 146 | long long offset = 0; 147 | for(int wordId = 0; wordId < wordFirstVec.size(); wordId ++){ 148 | 149 | for(int localTokenId = 0; localTokenId < wordFirstVec[wordId].size(); localTokenId ++){ 150 | 151 | int docId = wordFirstVec[wordId][localTokenId]; 152 | int chunkId = docToChunkVec[docId]; 153 | 154 | dataChunkVec[chunkId].wordFirstVec[wordId].push_back(docId); 155 | } 156 | } 157 | 158 | printf("write chunks ...\n"); 159 | //write 160 | for(int chunkId = 0;chunkId < numChunks;chunkId++) 161 | { 162 | printf("writing chunk %d ...\n", chunkId); 163 | dataChunkVec[chunkId].writeChunk(); 164 | } 165 | 166 | outFileChunk.close(); 167 | 168 | return 0; 169 | } --------------------------------------------------------------------------------