├── .gitmodules ├── .readthedocs.yml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── cpp ├── include │ ├── culda │ │ ├── cuda_lda_kernels.cuh │ │ └── culda.hpp │ ├── cuw2v │ │ ├── cuda_w2v_base_kernels.cuh │ │ ├── cuda_w2v_hs_kernels.cuh │ │ ├── cuda_w2v_ns_kernels.cuh │ │ └── cuw2v.hpp │ └── utils │ │ ├── cuda_utils_kernels.cuh │ │ ├── ioutils.hpp │ │ ├── log.hpp │ │ └── types.hpp └── src │ ├── culda │ └── culda.cu │ ├── cuw2v │ └── cuw2v.cu │ └── utils │ ├── ioutils.cc │ └── log.cc ├── cuda_setup.py ├── cusim ├── .gitignore ├── __init__.py ├── aux.py ├── constants.py ├── culda │ ├── __init__.py │ ├── bindings.cc │ └── pyculda.py ├── cuw2v │ ├── __init__.py │ ├── bindings.cc │ └── pycuw2v.py ├── ioutils │ ├── __init__.py │ ├── bindings.cc │ └── pyioutils.py └── proto │ └── config.proto ├── docs ├── Makefile ├── conf.py ├── index.rst ├── install.rst ├── lda.rst ├── make.bat └── w2v.rst ├── examples ├── README.md ├── cusim.topics.txt ├── example_lda.py ├── example_w2v.py ├── gensim.topics.txt └── requirements.txt ├── pyproject.toml ├── requirements.txt └── setup.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "3rd/json11"] 2 | path = 3rd/json11 3 | url = https://github.com/dropbox/json11 4 | [submodule "3rd/spdlog"] 5 | path = 3rd/spdlog 6 | url = https://github.com/gabime/spdlog 7 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sphinx: 4 | builder: html 5 | configuration: docs/conf.py 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # reference: https://github.com/jeremad/cuda-travis/blob/master/.travis.yml 2 | language: cpp 3 | 4 | sudo: enabled 5 | 6 | compiler: 7 | - gcc 8 | 9 | matrix: 10 | 
include: 11 | - name: CUDA 10 12 | env: 13 | - CUDA=10.1.105-1 14 | - CUDA_SHORT=10.1 15 | - UBUNTU_VERSION=ubuntu1804 16 | dist: bionic 17 | 18 | before_install: 19 | - sudo apt update 20 | - sudo apt install -y software-properties-common 21 | - sudo add-apt-repository -y ppa:deadsnakes/ppa 22 | - sudo apt update 23 | - sudo apt install -y python3-pip python3.6 g++ 24 | - pip3 install -U pip 25 | - pip3 install setuptools 26 | - pip3 install -r requirements.txt 27 | - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb 28 | - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER} 29 | - sudo dpkg -i ${INSTALLER} 30 | - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub 31 | - sudo apt-key add 7fa2af80.pub 32 | - sudo apt update -qq 33 | - sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} 34 | - sudo apt clean 35 | - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT} 36 | - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH} 37 | - export PATH=${CUDA_HOME}/bin:${PATH} 38 | - python3.6 -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto 39 | 40 | script: 41 | - sudo python3.6 setup.py install 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include cuda_setup.py 2 | include requirements.txt 3 | include pyproject.toml 4 | recursive-include cpp/src/cuw2v/ *.cu 5 | recursive-include cpp/src/culda/ *.cu 6 | recursive-include cpp/src/utils/ *.cc 7 | recursive-include cpp/include/cuw2v/ *.cuh 8 | recursive-include cpp/include/cuw2v/ *.hpp 9 | recursive-include cpp/include/culda/ *.cuh 10 | recursive-include cpp/include/culda/ *.hpp 11 | recursive-include cpp/include/utils/ *.cuh 12 | recursive-include cpp/include/utils/ *.hpp 13 | recursive-include 3rd/json11/ * 14 | recursive-include 3rd/spdlog/ * 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUSIM 2 | 3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Build 
Status](https://travis-ci.org/js1010/cusim.svg?branch=main)](https://travis-ci.org/js1010/cusim) [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/learn-travis/issues) [![Documentation Status](https://readthedocs.org/projects/cusim/badge/?version=latest)](https://cusim.readthedocs.io/en/latest/?badge=latest) 4 | 5 | Superfast CUDA implementation of Word2Vec and Latent Dirichlet Allocation (LDA) 6 | 7 | ### Introduction 8 | 9 | This project is to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as [gensim](https://github.com/RaRe-Technologies/gensim)'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the [word2vec](https://arxiv.org/pdf/1301.3781.pdf) model, and the most representative topic model, the [LDA (Latent Dirichlet Allocation)](https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) model. 10 | 11 | ### Requirements 12 | 13 | - Python3.6+ 14 | - gcc / g++ (>= 5.1 for c++14) 15 | - cuda >= 7.0 16 | - Tested on Ubuntu 18.04 / GCC 7.5 / CUDA 11.1 / Python 3.6 17 | 18 | ### How to install 19 | 20 | - install from pypi 21 | 22 | ```shell 23 | pip install cusim 24 | ``` 25 | 26 | 27 | - install from source 28 | 29 | ```shell 30 | # clone repo and submodules 31 | git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init 32 | 33 | # install requirements 34 | pip install -r requirements.txt 35 | 36 | # generate proto 37 | python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto 38 | 39 | # install 40 | python setup.py install 41 | ``` 42 | 43 | ### How to use 44 | 45 | - `examples/example_w2v.py`, `examples/example_lda.py` and `examples/README.md` will be very helpful to understand the usage. 
46 | - parameter description can be seen in `cusim/proto/config.proto` 47 | 48 | ### Performance 49 | 50 | - [AWS g4dn 2xlarge instance](https://aws.amazon.com/ec2/instance-types/g4/) is used for the experiment. (One NVIDIA T4 GPU with 8 vcpus, Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz) 51 | - results can be reproduced by simply running `examples/example_w2v.py` and `examples/example_lda.py` 52 | - To evaluate w2v model, I used `evaluate_word_pairs` function ([ref link](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating)) in gensim, note that better performance on WS-353 test set does not necessarily mean that the model will work better in application as described on the link. However, it is good to be measured quantitatively and fast training time will be at least a very objective measure of the performance. 53 | - I trained W2V model on `quora-duplicate-questions` dataset from gensim downloader api on GPU with cusim and compared the performance (both speed and model quality) with gensim. 54 | - To evaluate LDA model, I found there is no good way to measure the quality of training results quantitatively. But we can check the model by looking at the top words of each topic. Also, we can compare the training time quantitatively.
55 | - W2V (skip gram, hierarchical softmax) 56 | 57 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 58 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:| 59 | | training time (sec) | 892.596 | 544.212 | 310.727 | 226.472 | **16.162** | 60 | | pearson | 0.487832 | 0.487696 | 0.482821 | 0.487136 | **0.492101** | 61 | | spearman | 0.500846 | 0.506214 | 0.501048 | **0.506718** | 0.479468 | 62 | 63 | - W2V (skip gram, negative sampling) 64 | 65 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 66 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:| 67 | | training time (sec) | 586.545 | 340.489 | 220.804 | 146.23 | **33.9173** | 68 | | pearson | 0.354448 | 0.353952 | 0.352398 | 0.352925 | **0.360436** | 69 | | spearman | 0.369146 | 0.369365 | **0.370565** | 0.365822 | 0.355204 | 70 | 71 | - W2V (CBOW, hierarchical softmax) 72 | 73 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 74 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:| 75 | | training time (sec) | 250.135 | 155.121 | 103.57 | 73.8073 | **6.20787** | 76 | | pearson | 0.309651 | 0.321803 | 0.324854 | 0.314255 | **0.480298** | 77 | | spearman | 0.294047 | 0.308723 | 0.318293 | 0.300591 | **0.480971** | 78 | 79 | - W2V (CBOW, negative sampling) 80 | 81 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 82 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:| 83 | | training time (sec) | 176.923 | 100.369 | 
69.7829 | 49.9274 | **9.90391** | 84 | | pearson | 0.18772 | 0.193152 | 0.204509 | 0.187924 | **0.368202** | 85 | | spearman | 0.243975 | 0.24587 | 0.260531 | 0.237441 | **0.358042** | 86 | 87 | - LDA (`nytimes` dataset from https://archive.ics.uci.edu/ml/datasets/bag+of+words) 88 | - I found that setting `workers` variable in gensim LdaMulticore does not work properly (it uses all cores in instance anyway), so I just compared the speed between cusim with single GPU and gensim with 8 vcpus. 89 | - One can compare the quality of modeling by looking at `examples/cusim.topics.txt` and `examples/gensim.topics.txt`. 90 | 91 | | attr | gensim (8 vpus) | cusim (NVIDIA T4)| 92 | |:--------------------|------------------:|--------:| 93 | | training time (sec) | 447.376 | **76.6972** | 94 | 95 | ### Future tasks 96 | 97 | - support half precision 98 | - support multi device (multi device implementation on LDA model will not be that hard, while multi device training on w2v may require some considerations) 99 | - implement other models such as [FastText](https://arxiv.org/pdf/1607.04606.pdf), [BERT](https://arxiv.org/pdf/1810.04805.pdf), etc 100 | - **contribution is always welcome** 101 | -------------------------------------------------------------------------------- /cpp/include/culda/cuda_lda_kernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 
#pragma once
#include "utils/cuda_utils_kernels.cuh"


namespace cusim {

// Digamma (psi) function, device version.
// Shifts x into the asymptotic range [7, inf) via psi(x) = psi(x + 1) - 1/x,
// then evaluates the asymptotic series.
// reference: http://web.science.mq.edu.au/~mjohnson/code/digamma.c
__inline__ __device__
float Digamma(float x) {
  float result = 0.0f, xx, xx2, xx4;
  for ( ; x < 7.0f; ++x)
    result -= 1.0f / x;
  x -= 0.5f;
  xx = 1.0f / x;
  xx2 = xx * xx;
  xx4 = xx2 * xx2;
  result += logf(x) + 1.0f / 24.0f * xx2
    - 7.0f / 960.0f * xx4 + 31.0f / 8064.0f * xx4 * xx2
    - 127.0f / 30720.0f * xx4 * xx4;
  return result;
}

// Variational E step of LDA: one document (row of the CSR matrix) per block.
// cols/indptr/counts: CSR bag-of-words; vali marks validation entries.
// gamma: per-document variational Dirichlet params (num_indptr x num_topics).
// grad_alpha: per-block accumulator for the alpha gradient.
// new_beta: unnormalized topic-word sufficient statistics (guarded by locks).
// train_losses/vali_losses: per-block loss accumulators.
// Dynamic shared memory must hold 4 * num_topics floats.
__global__ void EstepKernel(
  const int* cols, const int* indptr,
  const bool* vali, const float* counts,
  const bool init_gamma, const int num_cols, const int num_indptr,
  const int num_topics, const int num_iters,
  const float* alpha, const float* beta,
  float* gamma, float* grad_alpha, float* new_beta,
  float* train_losses, float* vali_losses, int* locks) {

  // storage for block: four num_topics-sized scratch vectors
  extern __shared__ float shared_memory[];
  float* _new_gamma = &shared_memory[0];
  float* _phi = &shared_memory[num_topics];
  float* _loss_vec = &shared_memory[num_topics * 2];
  float* _vali_phi_sum = &shared_memory[num_topics * 3];

  float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;

  // grid-stride over documents
  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    float* _gamma = gamma + num_topics * i;
    if (init_gamma) {
      for (int j = threadIdx.x; j < num_topics; j += blockDim.x) {
        // BUGFIX: (end - beg) / num_topics was *integer* division (both
        // operands are int), truncating the per-topic share of the document
        // length -- to 0 whenever the doc is shorter than num_topics.
        // The intended init is alpha_k + n_d / K as a float.
        _gamma[j] = alpha[j] + static_cast<float>(end - beg) / num_topics;
      }
    }
    __syncthreads();

    // initiate phi sum for validation data for computing vali loss
    for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
      _vali_phi_sum[j] = 0.0f;

    // iterate E step
    for (int j = 0; j < num_iters; ++j) {
      // initialize new gamma
      for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
        _new_gamma[k] = 0.0f;
      __syncthreads();

      // compute phi from gamma
      for (int k = beg; k < end; ++k) {
        const int w = cols[k];
        const bool _vali = vali[k];
        const float c = counts[k];
        // validation tokens only contribute on the final iteration
        if (not _vali or j + 1 == num_iters) {
          // phi_k ∝ beta_{w,k} * exp(E[log theta_k])
          for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
            _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
          __syncthreads();

          // normalize phi and add it to new gamma and new beta
          float phi_sum = ReduceSum(_phi, num_topics);

          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
            _phi[l] /= phi_sum;

            // update gamma for train data and phi_sum for computing loss
            if (_vali)
              _vali_phi_sum[l] += _phi[l] * c;
            else
              _new_gamma[l] += _phi[l] * c;
          }
          __syncthreads();
        }

        if (j + 1 == num_iters) {
          // update beta for train data
          if (not _vali) {
            // spin-lock on word w so concurrent blocks do not race on the
            // same row of new_beta; only thread 0 acquires, all sync after
            if (threadIdx.x == 0) {
              while (atomicCAS(&locks[w], 0, 1)) {}
            }

            __syncthreads();
            for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
              new_beta[w * num_topics + l] += _phi[l] * c;
            __syncthreads();

            // release lock
            if (threadIdx.x == 0) locks[w] = 0;
            __syncthreads();
          }

          // compute loss: c * sum_k phi_k * (log beta_{w,k} - log phi_k)
          // see Eq (15) in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
          for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
            _loss_vec[l] = logf(fmaxf(beta[w * num_topics + l], EPS));
            _loss_vec[l] -= logf(fmaxf(_phi[l], EPS));
            _loss_vec[l] *= _phi[l];
          }
          __syncthreads();
          float _loss = ReduceSum(_loss_vec, num_topics) * c;
          if (threadIdx.x == 0) {
            if (_vali)
              vali_losses[blockIdx.x] += _loss;
            else
              train_losses[blockIdx.x] += _loss;
          }
          __syncthreads();
        }
        __syncthreads();
      }

      // update gamma
      for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
        _gamma[k] = _new_gamma[k] + alpha[k];
      __syncthreads();
    }

    // update gradient of alpha and loss from E[log(theta)]
    float gamma_sum = ReduceSum(_gamma, num_topics);
    for (int j = threadIdx.x; j < num_topics; j += blockDim.x) {
      float Elogthetad = Digamma(_gamma[j]) - Digamma(gamma_sum);
      _grad_alpha[j] += Elogthetad;
      // reuse _new_gamma / _vali_phi_sum as phi-weighted E[log theta] terms
      _new_gamma[j] *= Elogthetad;
      _vali_phi_sum[j] *= Elogthetad;
    }

    // see Eq (15) in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
    float train_loss = ReduceSum(_new_gamma, num_topics);
    float vali_loss = ReduceSum(_vali_phi_sum, num_topics);
    if (threadIdx.x == 0) {
      train_losses[blockIdx.x] += train_loss;
      vali_losses[blockIdx.x] += vali_loss;
    }

    __syncthreads();
  }
}

}  // cusim
--------------------------------------------------------------------------------
/cpp/include/culda/culda.hpp:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
6 | #pragma once 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include // NOLINT 28 | 29 | #include "json11.hpp" 30 | #include "utils/log.hpp" 31 | #include "utils/types.hpp" 32 | 33 | namespace cusim { 34 | 35 | 36 | // reference: https://people.math.sc.edu/Burkardt/cpp_src/asa121/asa121.cpp 37 | inline float Trigamma(float x) { 38 | const float a = 0.0001f; 39 | const float b = 5.0f; 40 | const float b2 = 0.1666666667f; 41 | const float b4 = -0.03333333333f; 42 | const float b6 = 0.02380952381f; 43 | const float b8 = -0.03333333333f; 44 | float value = 0, y = 0, z = x; 45 | if (x <= a) return 1.0f / x / x; 46 | while (z < b) { 47 | value += 1.0f / z / z; 48 | z++; 49 | } 50 | y = 1.0f / z / z; 51 | value += value + 0.5 * y + (1.0 52 | + y * (b2 53 | + y * (b4 54 | + y * (b6 55 | + y * b8)))) / z; 56 | return value; 57 | } 58 | 59 | 60 | class CuLDA { 61 | public: 62 | CuLDA(); 63 | ~CuLDA(); 64 | bool Init(std::string opt_path); 65 | void LoadModel(float* alpha, float* beta, 66 | float* grad_alpha, float* new_beta, const int num_words); 67 | std::pair FeedData( 68 | const int* indices, const int* indptr, 69 | const bool* vali, const float* counts, 70 | float* gamma, const bool init_gamma, 71 | const int num_indices, const int num_indptr, 72 | const int num_iters); 73 | void Pull(); 74 | void Push(); 75 | int GetBlockCnt(); 76 | 77 | private: 78 | DeviceInfo dev_info_; 79 | json11::Json opt_; 80 | std::shared_ptr logger_; 81 | std::unique_ptr logger_container_; 82 | thrust::device_vector dev_alpha_, dev_beta_; 83 | thrust::device_vector dev_grad_alpha_, dev_new_beta_; 84 | thrust::device_vector dev_locks_; 85 | 86 | float *alpha_, *beta_, *grad_alpha_, *new_beta_; 87 | int block_cnt_, block_dim_; 88 | int num_topics_, num_words_; 89 | }; 90 
}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_base_kernels.cuh:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include "utils/cuda_utils_kernels.cuh"

// clamp bound for dot products before expf to avoid overflow
#define MAX_EXP 20

namespace cusim {


// One positive (label = 1) logistic step between input vector vec1 and
// output vector vec2, executed cooperatively by the whole thread block:
// accumulates the gradient w.r.t. vec1 into grad and updates vec2 in place.
// loss_nume accumulates -log(sigmoid(dot)), loss_deno counts updates.
__inline__ __device__
void PositiveFeedback(const float* vec1, float* vec2, float* grad,
    float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
  // block-wide scalar: written by thread 0, read by all after the barrier
  static __shared__ float g;
  float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
  if (threadIdx.x == 0) {
    float exp_dot = expf(-dot);
    g = exp_dot / (1 + exp_dot) * lr;  // lr * (1 - sigmoid(dot))
    loss_nume += logf(1 + exp_dot);
    loss_deno++;
  }
  __syncthreads();
  for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
    grad[i] += vec2[i] * g;
    vec2[i] += vec1[i] * g;
  }
  __syncthreads();
}

// One negative (label = 0) logistic step; mirror image of PositiveFeedback
// with the gradient sign flipped. loss_nume accumulates -log(1 - sigmoid(dot)).
__inline__ __device__
void NegativeFeedback(const float* vec1, float* vec2, float* grad,
    float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
  // block-wide scalar: written by thread 0, read by all after the barrier
  static __shared__ float g;
  float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
  if (threadIdx.x == 0) {
    float exp_dot = expf(dot);
    g = exp_dot / (1 + exp_dot) * lr;  // lr * sigmoid(dot)
    loss_nume += logf(1 + exp_dot);
    loss_deno++;
  }
  __syncthreads();
  for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
    grad[i] -= vec2[i] * g;
    vec2[i] -= vec1[i] * g;
  }
  __syncthreads();
}

}  // cusim
--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_hs_kernels.cuh:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include "utils/cuda_utils_kernels.cuh"
#include "cuw2v/cuda_w2v_base_kernels.cuh"

// NOTE(review): template arguments (e.g. on uniform_int_distribution) appear
// to have been stripped by the text extraction of this file -- verify
// against the upstream source.

namespace cusim {

// Word2vec skip-gram with hierarchical softmax; one sentence (CSR row) per
// block. For each center word j and context word k within the (randomly
// shrunk) window, walks the Huffman path of j (codes/points between
// hs_indptr[cols[j]] and hs_indptr[cols[j]+1]) doing one logistic step per
// node, then applies the accumulated gradient to the context embedding.
// Dynamic shared memory must hold num_dims floats.
__global__ void W2VHsSgKernel(
  const int* cols, const int* indptr,
  const bool* codes, const int* points, const int* hs_indptr,
  const int num_indptr, const int num_dims, const int window_size,
  default_random_engine* rngs,
  float* emb_in, float* emb_out,
  float* loss_nume, float* loss_deno, const float lr) {

  // per-block RNG and loss accumulators
  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution dist_window(0, window_size - 1);
  static __shared__ int reduced_windows;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];

  // zero-initialize shared mem
  for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
    grad[i] = 0.0f;
  __syncthreads();

  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
      // random window shrinkage (word2vec's "reduced window")
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        float* _emb_in = emb_in + num_dims * cols[k];
        int beg3 = hs_indptr[cols[j]];
        int end3 = hs_indptr[cols[j] + 1];
        // one logistic update per Huffman tree node on the path of word j
        for (int l = beg3; l < end3; ++l) {
          if (codes[l]) {
            PositiveFeedback(_emb_in, emb_out + num_dims * points[l],
                grad, _loss_nume, _loss_deno, num_dims, lr);
          } else {
            NegativeFeedback(_emb_in, emb_out + num_dims * points[l],
                grad, _loss_nume, _loss_deno, num_dims, lr);
          }
          __syncthreads();
        }
        // apply and reset the accumulated gradient for this context word
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          _emb_in[l] += grad[l];
          grad[l] = 0.0f;
        }
        __syncthreads();
      }
    }
  }
}

// Word2vec CBOW with hierarchical softmax; one sentence per block.
// Averages (or sums, when cbow_mean is false) the context embeddings into
// cbow, runs the Huffman-path logistic steps against it, then spreads the
// accumulated gradient back to every context embedding.
// Dynamic shared memory must hold 2 * num_dims floats (grad + cbow).
__global__ void W2VHsCbowKernel(
  const int* cols, const int* indptr,
  const bool* codes, const int* points, const int* hs_indptr,
  const int num_indptr, const int num_dims, const int window_size, default_random_engine* rngs,
  float* emb_in, float* emb_out,
  float* loss_nume, float* loss_deno,
  const bool cbow_mean, const float lr) {

  // per-block RNG and loss accumulators
  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution dist_window(0, window_size - 1);
  static __shared__ int reduced_windows;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];
  float* cbow = &shared_memory[num_dims];

  __syncthreads();

  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      // skip windows with no context words
      if (end2 - beg2 <= 1) continue;

      // zero-initialize shared mem
      for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
        grad[k] = 0.0f;
        cbow[k] = 0.0f;
      }

      // compute cbow: sum of context embeddings (center word excluded)
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          cbow[l] += emb_in[num_dims * cols[k] + l];
        }
      }
      if (cbow_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          cbow[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      int beg3 = hs_indptr[cols[j]];
      int end3 = hs_indptr[cols[j] + 1];
      for (int k = beg3; k < end3; ++k) {
        if (codes[k]) {
          PositiveFeedback(cbow, emb_out + num_dims * points[k],
              grad, _loss_nume, _loss_deno, num_dims, lr);
        } else {
          NegativeFeedback(cbow, emb_out + num_dims * points[k],
              grad, _loss_nume, _loss_deno, num_dims, lr);
        }
        __syncthreads();
      }

      // normalize grad if cbow_mean = true
      if (cbow_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          grad[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      // update emb_in: every context word receives the same gradient
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          emb_in[num_dims * cols[k] + l] += grad[l];
        }
        __syncthreads();
      }
    }
  }
}

}  // cusim
--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_ns_kernels.cuh:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
#include "utils/cuda_utils_kernels.cuh"
#include "cuw2v/cuda_w2v_base_kernels.cuh"

// NOTE(review): template arguments (e.g. on uniform_int_distribution) appear
// to have been stripped by the text extraction of this file -- verify
// against the upstream source.

namespace cusim {

// Word2vec skip-gram with negative sampling; one sentence (CSR row) per
// block. For each (center, context) pair: one positive logistic step against
// the center word's output vector and `neg` negative steps against words
// drawn from random_table (the pre-built unigram table).
// Dynamic shared memory must hold num_dims floats.
__global__ void W2VNegSgKernel(
  const int* cols, const int* indptr,
  const int* random_table, default_random_engine* rngs, const int random_size,
  const int num_indptr, const int num_dims, const int neg, const int window_size,
  float* emb_in, float* emb_out, float* loss_nume, float* loss_deno, const float lr) {

  // per-block RNG and loss accumulators
  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution dist_neg(0, random_size - 1);
  uniform_int_distribution dist_window(0, window_size - 1);
  // block-wide scalars chosen by thread 0
  __shared__ int reduced_windows;
  __shared__ int neg_word;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];

  // zero-initialize shared mem
  for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
    grad[i] = 0.0f;
  __syncthreads();

  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
      // random window shrinkage (word2vec's "reduced window")
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        float* _emb_in = emb_in + num_dims * cols[k];
        PositiveFeedback(_emb_in, emb_out + num_dims * cols[j],
            grad, _loss_nume, _loss_deno, num_dims, lr);
        for (int l = 0; l < neg; ++l) {
          if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
          __syncthreads();
          // skip accidental draws of the center word itself
          if (neg_word == cols[j]) continue;
          NegativeFeedback(_emb_in, emb_out + num_dims * neg_word,
              grad, _loss_nume, _loss_deno, num_dims, lr);
        }
        __syncthreads();
        // apply and reset the accumulated gradient for this context word
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          _emb_in[l] += grad[l];
          grad[l] = 0.0f;
        }
        __syncthreads();
      }
    }
  }
}

// Word2vec CBOW with negative sampling; one sentence per block.
// Averages (or sums, when cbow_mean is false) the context embeddings into
// cbow, performs one positive step plus `neg` negative-sample steps against
// it, then spreads the accumulated gradient back to each context embedding.
// Dynamic shared memory must hold 2 * num_dims floats (grad + cbow).
__global__ void W2VNegCbowKernel(
  const int* cols, const int* indptr,
  const int* random_table, default_random_engine* rngs, const int random_size,
  const int num_indptr, const int num_dims, const int neg, const int window_size,
  float* emb_in, float* emb_out,
  float* loss_nume, float* loss_deno, const bool cbow_mean, const float lr) {

  // per-block RNG and loss accumulators
  default_random_engine& rng = rngs[blockIdx.x];
  float& _loss_nume = loss_nume[blockIdx.x];
  float& _loss_deno = loss_deno[blockIdx.x];

  uniform_int_distribution dist_neg(0, random_size - 1);
  uniform_int_distribution dist_window(0, window_size - 1);
  static __shared__ int reduced_windows;
  static __shared__ int neg_word;
  extern __shared__ float shared_memory[];
  float* grad = &shared_memory[0];
  float* cbow = &shared_memory[num_dims];

  __syncthreads();

  for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
    int beg = indptr[i], end = indptr[i + 1];
    for (int j = beg; j < end; ++j) {
      if (threadIdx.x == 0) reduced_windows = dist_window(rng);
      __syncthreads();
      int beg2 = max(beg, j - window_size + reduced_windows);
      int end2 = min(end, j + window_size - reduced_windows + 1);
      // skip windows with no context words
      if (end2 - beg2 <= 1) continue;

      // zero-initialize shared mem
      for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
        grad[k] = 0.0f;
        cbow[k] = 0.0f;
      }

      // compute cbow: sum of context embeddings (center word excluded)
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
          cbow[l] += emb_in[num_dims * cols[k] + l];
        }
      }
      if (cbow_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          cbow[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      PositiveFeedback(cbow, emb_out + num_dims * cols[j], grad,
          _loss_nume, _loss_deno, num_dims, lr);
      __syncthreads();

      // update negative feedback
      for (int k = 0; k < neg; ++k) {
        if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
        __syncthreads();
        // skip accidental draws of the center word itself
        if (neg_word == cols[j]) continue;
        NegativeFeedback(cbow, emb_out + num_dims * neg_word,
            grad, _loss_nume, _loss_deno, num_dims, lr);
      }
      __syncthreads();

      // normalize grad if cbow_mean = true
      if (cbow_mean) {
        for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
          grad[k] /= (end2 - beg2 - 1);
        }
      }
      __syncthreads();

      // update emb_in: every context word receives the same gradient
      for (int k = beg2; k < end2; ++k) {
        if (k == j) continue;
        for (int l = threadIdx.x; l < num_dims; l += blockDim.x)
          emb_in[num_dims * cols[k] + l] += grad[l];
      }
      __syncthreads();

    }
  }
}

}  // cusim
--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuw2v.hpp:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
// NOTE(review): the <...> targets of the include lines below, and the
// template arguments on std::pair / shared_ptr / unique_ptr /
// thrust::device_vector in this header, were stripped by the text
// extraction -- restore from the upstream source before compiling.
#include
#include
#include
#include
#include
#include
#include

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include  // NOLINT

#include "json11.hpp"
#include "utils/log.hpp"
#include "utils/types.hpp"

using thrust::random::default_random_engine;

namespace cusim {

// Host-side driver for GPU word2vec: owns device copies of the input/output
// embeddings, builds the Huffman tree (hierarchical softmax) or unigram
// random table (negative sampling), and launches the sg/cbow kernels.
class CuW2V {
 public:
  CuW2V();
  ~CuW2V();
  // parse the json option file; returns false on parse/open failure
  bool Init(std::string opt_path);
  // bind host embedding buffers and mirror them on the device
  void LoadModel(float* emb_in, float* emb_out);
  // precompute codes/points/indptr for hierarchical softmax
  void BuildHuffmanTree(const float* word_count, const int num_words);
  // precompute the unigram sampling table for negative sampling
  void BuildRandomTable(const double* word_count, const int num_words, const int table_size);
  // train on one CSR batch; returns the accumulated loss pair
  std::pair FeedData(const int* cols, const int* indptr,
      const int num_cols, const int num_indptr);
  void Pull();  // copy embeddings device -> host

 private:
  DeviceInfo dev_info_;
  json11::Json opt_;
  std::shared_ptr logger_;
  std::unique_ptr logger_container_;
  int block_cnt_, block_dim_;
  int num_dims_, num_words_, window_size_;
  // emb_in_/emb_out_ are borrowed host pointers (owned by the python side)
  float *emb_in_, *emb_out_, lr_;
  thrust::device_vector dev_emb_in_, dev_emb_out_;

  // variables to construct huffman tree
  int max_depth_;
  thrust::device_vector dev_codes_;
  thrust::device_vector dev_points_, dev_hs_indptr_;

  // related to negative sampling / hierarchical softmax and skip gram / cbow
  bool sg_, cbow_mean_;
  int neg_;

  // variables to construct random table
  thrust::device_vector dev_random_table_;
  int random_size_, seed_;
  thrust::device_vector dev_rngs_;
};

}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/include/utils/cuda_utils_kernels.cuh:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
//
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once
// NOTE(review): the <...> targets of the include lines below were stripped
// by the text extraction -- restore from the upstream source.
#include
// #include
#include

#include
#include
#include
#include
#include
#include
#include
#include

#include
#include
#include
#include
#include "utils/types.hpp"

using thrust::random::default_random_engine;
using thrust::random::uniform_int_distribution;

namespace cusim {

// Error Checking utilities, checks status codes from cuda calls
// and throws exceptions on failure (which cython can proxy back to python)
#define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
inline void checkCuda(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    std::stringstream err;
    err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
    throw std::runtime_error(err.str());
  }
}

// cublas error helpers kept for reference; cublas is not currently used
// inline const char* cublasGetErrorString(cublasStatus_t status) {
//   switch (status) {
//     case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
//     case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
//     case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
//     case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
//     case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
//     case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
//     case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
//     case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
//   }
//   return "Unknown";
// }
//
// #define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
// inline void checkCublas(cublasStatus_t code, const char * file, int line) {
//   if (code != CUBLAS_STATUS_SUCCESS) {
//     std::stringstream err;
//     err << "cublas error: " << cublasGetErrorString(code)
//         << " (" << file << ":" << line << ")";
//     throw std::runtime_error(err.str());
//   }
// }

// Query the current device and estimate its CUDA-core count from the
// compute capability (used by callers to size kernel launches).
inline DeviceInfo GetDeviceInfo() {
  DeviceInfo ret;
  CHECK_CUDA(cudaGetDevice(&ret.devId));
  cudaDeviceProp prop;
  CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId));
  ret.mp_cnt = prop.multiProcessorCount;
  ret.major = prop.major;
  ret.minor = prop.minor;
  // cores-per-SM table by architecture
  // reference: https://stackoverflow.com/a/32531982
  switch (ret.major) {
    case 2:  // Fermi
      if (ret.minor == 1)
        ret.cores = ret.mp_cnt * 48;
      else
        ret.cores = ret.mp_cnt * 32;
      break;
    case 3:  // Kepler
      ret.cores = ret.mp_cnt * 192;
      break;
    case 5:  // Maxwell
      ret.cores = ret.mp_cnt * 128;
      break;
    case 6:  // Pascal
      if (ret.minor == 1 or ret.minor == 2)
        ret.cores = ret.mp_cnt * 128;
      else if (ret.minor == 0)
        ret.cores = ret.mp_cnt * 64;
      else
        ret.unknown = true;
      break;
    case 7:  // Volta and Turing
      if (ret.minor == 0 or ret.minor == 5)
        ret.cores = ret.mp_cnt * 64;
      else
        ret.unknown = true;
      break;
    case 8:  // Ampere
      if (ret.minor == 0)
        ret.cores = ret.mp_cnt * 64;
      else if (ret.minor == 6)
        ret.cores = ret.mp_cnt * 128;
      else
        ret.unknown = true;
      break;
    default:
      ret.unknown = true;
      break;
  }
  // fall back to a reasonable default when the architecture is unknown
  if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
  return ret;
}

// Sum `val` across the threads of one warp; the result is valid in lane 0.
__inline__ __device__
float warp_reduce_sum(float val) {
#if __CUDACC_VER_MAJOR__ >= 9
  // __shfl_down is deprecated with cuda 9+. use newer variants
  unsigned int active = __activemask();
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    val += __shfl_down_sync(active, val, offset);
  }
#else
#pragma unroll
  for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset);
  }
#endif
  return val;
}

// Block-wide dot product of vec1 and vec2; all threads of the block must
// call this together, and all receive the same result.
__inline__ __device__
float Dot(const float* vec1, const float* vec2, const int length) {

  // one slot per warp for the partial sums
  static __shared__ float shared[32];

  // figure out the warp/ position inside the warp
  int warp =  threadIdx.x / WARP_SIZE;
  int lane = threadIdx.x % WARP_SIZE;

  // paritial sum
  float val = 0.0f;
  for (int i = threadIdx.x; i < length; i += blockDim.x)
    val += vec1[i] * vec2[i];
  val = warp_reduce_sum(val);

  // write out the partial reduction to shared memory if appropiate
  if (lane == 0) {
    shared[warp] = val;
  }
  __syncthreads();

  // if we we don't have multiple warps, we're done
  if (blockDim.x <= WARP_SIZE) {
    return shared[0];
  }

  // otherwise reduce again in the first warp
  // NOTE(review): assumes blockDim.x is a multiple of WARP_SIZE -- confirm
  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f;
  if (warp == 0) {
    val = warp_reduce_sum(val);
    // broadcast back to shared memory
    if (threadIdx.x == 0) {
      shared[0] = val;
    }
  }
  __syncthreads();
  return shared[0];
}

// Block-wide sum of vec[0:length]; same cooperative contract as Dot.
__inline__ __device__
float ReduceSum(const float* vec, const int length) {

  // one slot per warp for the partial sums
  static __shared__ float shared[32];

  // figure out the warp/ position inside the warp
  int warp =  threadIdx.x / WARP_SIZE;
  int lane = threadIdx.x % WARP_SIZE;

  // paritial sum
  float val = 0.0f;
  for (int i = threadIdx.x; i < length; i += blockDim.x)
    val += vec[i];
  val = warp_reduce_sum(val);

  // write out the partial reduction to shared memory if appropiate
  if (lane == 0) {
    shared[warp] = val;
  }
  __syncthreads();

  // if we we don't have multiple warps, we're done
  if (blockDim.x <= WARP_SIZE) {
    return shared[0];
  }

  // otherwise reduce again in the first warp
  // NOTE(review): assumes blockDim.x is a multiple of WARP_SIZE -- confirm
  val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f;
  if (warp == 0) {
    val = warp_reduce_sum(val);
    // broadcast back to shared memory
    if (threadIdx.x == 0) {
      shared[0] = val;
    }
  }
  __syncthreads();
  return shared[0];
}

// Seed one RNG per block; launched with one block per RNG.
__global__ void InitRngsKernel(default_random_engine* rngs, int rand_seed) {
  rngs[blockIdx.x].seed(blockIdx.x + rand_seed);
}

}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/include/utils/ioutils.hpp:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once

// NOTE(review): the <...> targets of the include lines below, and the
// template arguments on std::pair / tuple / vector / map members in this
// header, were stripped by the text extraction -- restore from upstream.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include  // NOLINT
#include
#include

#include "json11.hpp"
#include "utils/log.hpp"

namespace cusim {

// Multi-threaded text / bag-of-words reader used by the python bindings:
// streams a corpus file, builds the word vocabulary, and tokenizes lines
// into CSR buffers.
class IoUtils {
 public:
  IoUtils();
  ~IoUtils();
  // parse the json option file; returns false on parse/open failure
  bool Init(std::string opt_path);
  // open the stream file and count its lines
  int64_t LoadStreamFile(std::string filepath);
  // first pass: accumulate word counts over num_lines lines
  std::pair ReadStreamForVocab(int num_lines, int num_threads);
  // second pass: tokenize lines against the built vocabulary
  std::pair TokenizeStream(int num_lines, int num_threads);
  // finalize vocabulary (filter by min_count) and persist keys/counts
  void GetWordVocab(int min_count, std::string keys_path, std::string count_path);
  // export the tokenized buffers accumulated by TokenizeStream
  void GetToken(int* rows, int* cols, int* indptr);
  // read the header of a bag-of-words file
  std::tuple ReadBagOfWordsHeader(std::string filepath);
  // read the (row, col, count) triplets following the header
  void ReadBagOfWordsContent(int64_t* rows, int* cols, float* counts, const int num_lines);

 private:
  void ParseLine(std::string line, std::vector& line_vec);
  void ParseLineImpl(std::string line, std::vector& line_vec);

  // per-line token buffers shared between worker threads
  std::vector> cols_;
  std::vector indptr_;
  std::mutex global_lock_;
  std::ifstream fin_;
  json11::Json opt_;
  std::shared_ptr logger_;
  std::unique_ptr logger_container_;
  std::unordered_map word_idmap_, word_count_;
  std::vector word_list_;
  int64_t num_lines_, remain_lines_;
  bool lower_;  // lowercase tokens while parsing
};  // class IoUtils

}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/include/utils/log.hpp:
--------------------------------------------------------------------------------
// Copyright (c) 2020 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.

// Thin spdlog wrapper plus logging macros that prepend file:line.
// The macros expect a `logger_` in scope at the call site.
// reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
#pragma once
// NOTE(review): the <...> targets of the two include lines below (and the
// shared_ptr template arguments in this header) were stripped by the text
// extraction -- restore from the upstream source.
#include
#include
#define SPDLOG_EOL ""
#define SPDLOG_TRACE_ON
#include "spdlog/spdlog.h"
#include "spdlog/sinks/stdout_color_sinks.h"

// basename of the current source file, for log prefixes
#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)

#define INFO(x, ...) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
#define DEBUG(x, ...) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
#define WARN(x, ...) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
#define TRACE(x, ...) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
#define CRITICAL(x, ...) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);

// zero-argument variants (no __VA_ARGS__)
#define INFO0(x) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__);
#define DEBUG0(x) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__);
#define WARN0(x) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__);
#define TRACE0(x) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__);
#define CRITICAL0(x) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__);

namespace cusim {

// Owns a named spdlog logger and a process-wide log level shared by all
// instances (global_logging_level_ is static).
class CuSimLogger {
 public:
  CuSimLogger();
  explicit CuSimLogger(std::string name);
  std::shared_ptr& get_logger();
  void set_log_level(int level);
  int get_log_level();

 private:
  static int global_logging_level_;
  std::shared_ptr logger_;
};  // class CuSimLogger

}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/include/utils/types.hpp:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#pragma once

// Basic description of the active CUDA device, filled by GetDeviceInfo().
struct DeviceInfo {
  int devId, mp_cnt, major, minor, cores;
  bool unknown = false;  // true when the architecture is not in the table
};

#define WARP_SIZE 32
#define EPS 1e-10f
--------------------------------------------------------------------------------
/cpp/src/culda/culda.cu:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#include "culda/culda.hpp"
#include "culda/cuda_lda_kernels.cuh"

namespace cusim {

// Set up logging and probe the CUDA device so Init can size kernel launches.
CuLDA::CuLDA() {
  logger_container_.reset(new CuSimLogger("lda"));
  logger_ = logger_container_->get_logger();
  dev_info_ = GetDeviceInfo();
  if (dev_info_.unknown) DEBUG0("Unknown device type");
  INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
      dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores);
}

CuLDA::~CuLDA() {}

// Load json options and derive the launch configuration.
// Returns false when the file cannot be opened or parsed.
bool CuLDA::Init(std::string opt_path) {
  std::ifstream in(opt_path.c_str());
  if (not in.is_open()) return false;

  // slurp the whole option file
  // NOTE(review): istreambuf_iterator template args were stripped by the
  // text extraction (likely <char>) -- restore from upstream.
  std::string str((std::istreambuf_iterator(in)),
      std::istreambuf_iterator());
  std::string err_cmt;
  auto _opt = json11::Json::parse(str, err_cmt);
  if (not err_cmt.empty()) return false;
  opt_ = _opt;
  logger_container_->set_log_level(opt_["c_log_level"].int_value());
  num_topics_ = opt_["num_topics"].int_value();
  block_dim_ = opt_["block_dim"].int_value();
  // oversubscribe blocks relative to physical cores by hyper_threads
  block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_);
  INFO("num_topics: {}, block_dim: {}, block_cnt: {}", num_topics_, block_dim_, block_cnt_);
  return true;
}

// Bind host model buffers (kept for Pull/Push) and mirror them on the
// device; also allocates the per-word spin locks used by EstepKernel.
void CuLDA::LoadModel(float* alpha, float* beta,
    float* grad_alpha, float* new_beta, int num_words) {
  num_words_ = num_words;
  DEBUG("copy model({} x {})", num_words_, num_topics_);
  dev_alpha_.resize(num_topics_);
  dev_beta_.resize(num_topics_ * num_words_);
  thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
  thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
  alpha_ = alpha; beta_ = beta;

  // resize device vector
  grad_alpha_ = grad_alpha;
  new_beta_ = new_beta;
  dev_grad_alpha_.resize(num_topics_ * block_cnt_);  // one slice per block
  dev_new_beta_.resize(num_topics_ * num_words_);
  // copy to device
  thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
  thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
  // set locks (one per word row of new_beta, initially unlocked)
  // NOTE(review): vector template args were stripped by the text
  // extraction throughout this file -- restore from upstream.
  dev_locks_.resize(num_words_);
  std::vector host_locks(num_words_, 0);
  thrust::copy(host_locks.begin(), host_locks.end(), dev_locks_.begin());

  CHECK_CUDA(cudaDeviceSynchronize());
}

// Run the variational E step on one CSR batch and return the accumulated
// {train_loss, vali_loss}. gamma is copied in, updated on device, and
// copied back out.
std::pair CuLDA::FeedData(
    const int* cols, const int* indptr,
    const bool* vali, const float* counts, float* gamma,
    const bool init_gamma, const int num_cols, const int num_indptr,
    const int num_iters) {

  // copy feed data to GPU memory
  thrust::device_vector dev_cols(num_cols);
  thrust::device_vector dev_indptr(num_indptr + 1);
  thrust::device_vector dev_vali(num_cols);
  thrust::device_vector dev_counts(num_cols);
  thrust::device_vector dev_gamma(num_indptr * num_topics_);
  thrust::device_vector dev_train_losses(block_cnt_, 0.0f);
  thrust::device_vector dev_vali_losses(block_cnt_, 0.0f);
  thrust::copy(cols, cols + num_cols, dev_cols.begin());
  thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
  thrust::copy(vali, vali + num_cols, dev_vali.begin());
  thrust::copy(counts, counts + num_cols, dev_counts.begin());
  thrust::copy(gamma, gamma + num_indptr * num_topics_, dev_gamma.begin());
  CHECK_CUDA(cudaDeviceSynchronize());
  DEBUG0("copy feed data to GPU memory");

  // run E step in GPU
  // NOTE(review): the launch configuration between <<< and >>> (grid,
  // block, shared-memory size) was stripped by the text extraction --
  // restore from upstream (EstepKernel needs 4 * num_topics_ floats of
  // dynamic shared memory).
  EstepKernel<<>>(
    thrust::raw_pointer_cast(dev_cols.data()),
    thrust::raw_pointer_cast(dev_indptr.data()),
    thrust::raw_pointer_cast(dev_vali.data()),
    thrust::raw_pointer_cast(dev_counts.data()),
    init_gamma, num_cols, num_indptr, num_topics_, num_iters,
    thrust::raw_pointer_cast(dev_alpha_.data()),
    thrust::raw_pointer_cast(dev_beta_.data()),
    thrust::raw_pointer_cast(dev_gamma.data()),
    thrust::raw_pointer_cast(dev_grad_alpha_.data()),
    thrust::raw_pointer_cast(dev_new_beta_.data()),
    thrust::raw_pointer_cast(dev_train_losses.data()),
    thrust::raw_pointer_cast(dev_vali_losses.data()),
    thrust::raw_pointer_cast(dev_locks_.data()));
  CHECK_CUDA(cudaDeviceSynchronize());
  DEBUG0("run E step in GPU");

  // pull loss
  std::vector train_losses(block_cnt_), vali_losses(block_cnt_);
  thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
  thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
  thrust::copy(dev_gamma.begin(), dev_gamma.end(), gamma);
  CHECK_CUDA(cudaDeviceSynchronize());
  DEBUG0("pull loss values");

  // accumulate per-block losses into scalars
  float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
  float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f);
  return {train_loss, vali_loss};
}

// Copy the M-step accumulators back to the bound host buffers.
void CuLDA::Pull() {
  thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_);
  thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_);
  CHECK_CUDA(cudaDeviceSynchronize());
}

// Push the (host-updated) model and accumulators back to the device.
void CuLDA::Push() {
  thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin());
  thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
  thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin());
  thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
  CHECK_CUDA(cudaDeviceSynchronize());
}

// Number of blocks used for kernel launches (python side sizes grad_alpha
// accordingly).
int CuLDA::GetBlockCnt() {
  return block_cnt_;
}

}  // namespace cusim
--------------------------------------------------------------------------------
/cpp/src/cuw2v/cuw2v.cu:
--------------------------------------------------------------------------------
// Copyright (c) 2021 Jisang Yoon
// All rights reserved.
//
// This source code is licensed under the Apache 2.0 license found in the
// LICENSE file in the root directory of this source tree.
#include "cuw2v/cuw2v.hpp"
#include "cuw2v/cuda_w2v_base_kernels.cuh"
#include "cuw2v/cuda_w2v_ns_kernels.cuh"
#include "cuw2v/cuda_w2v_hs_kernels.cuh"

namespace cusim {

// Node of the Huffman tree built over word frequencies for hierarchical
// softmax; left/right index into the node array, index is the word id.
struct HuffmanTreeNode {
  float count;
  int index, left, right;
  HuffmanTreeNode(float count0, int index0, int left0, int right0) {
    count = count0; index = index0; left = left0; right = right0;
  }
};

// global node pool + comparator used while building the Huffman tree
// (min-heap by count: `>` makes the priority queue pop the smallest)
std::vector huffman_nodes;
bool CompareIndex(int lhs, int rhs) {
  return huffman_nodes[lhs].count > huffman_nodes[rhs].count;
}

// Set up logging and probe the CUDA device so Init can size kernel launches.
CuW2V::CuW2V() {
  logger_container_.reset(new CuSimLogger("w2v"));
  logger_ = logger_container_->get_logger();
  dev_info_ = GetDeviceInfo();
  if (dev_info_.unknown) DEBUG0("Unknown device type");
  INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
      dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores);
}

CuW2V::~CuW2V() {}

// Load json options (function continues beyond this file chunk).
bool CuW2V::Init(std::string opt_path) {
  std::ifstream in(opt_path.c_str());
  if (not in.is_open()) return false;

std::string str((std::istreambuf_iterator(in)), 42 | std::istreambuf_iterator()); 43 | std::string err_cmt; 44 | auto _opt = json11::Json::parse(str, err_cmt); 45 | if (not err_cmt.empty()) return false; 46 | opt_ = _opt; 47 | logger_container_->set_log_level(opt_["c_log_level"].int_value()); 48 | num_dims_ = opt_["num_dims"].int_value(); 49 | block_dim_ = opt_["block_dim"].int_value(); 50 | block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_); 51 | sg_ = opt_["skip_gram"].bool_value(); 52 | cbow_mean_ = opt_["cbow_mean"].bool_value(); 53 | window_size_ = opt_["window_size"].int_value(); 54 | lr_ = opt_["lr"].number_value(); 55 | 56 | // if zero, we will use hierarchical softmax 57 | neg_ = opt_["neg"].int_value(); 58 | 59 | // random seed 60 | seed_ = opt_["seed"].int_value(); 61 | dev_rngs_.resize(block_cnt_); 62 | InitRngsKernel<<>>( 63 | thrust::raw_pointer_cast(dev_rngs_.data()), seed_); 64 | 65 | INFO("num_dims: {}, block_dim: {}, block_cnt: {}, objective type: {}, neg: {}", 66 | num_dims_, block_dim_, block_cnt_, sg_? 
"skip gram": "cbow", neg_); 67 | return true; 68 | } 69 | 70 | void CuW2V::BuildRandomTable(const double* word_count, const int num_words, const int table_size) { 71 | num_words_ = num_words; 72 | std::vector host_random_table; 73 | for (int i = 0; i < num_words; ++i) { 74 | int weight = std::max(1, static_cast(word_count[i] * static_cast(table_size))); 75 | for (int j = 0; j < weight; ++j) 76 | host_random_table.push_back(i); 77 | } 78 | 79 | random_size_ = host_random_table.size(); 80 | dev_random_table_.resize(random_size_); 81 | thrust::copy(host_random_table.begin(), host_random_table.end(), dev_random_table_.begin()); 82 | CHECK_CUDA(cudaDeviceSynchronize()); 83 | 84 | INFO("random table initialzied, size: {} => {}", table_size, random_size_); 85 | } 86 | 87 | void CuW2V::BuildHuffmanTree(const float* word_count, const int num_words) { 88 | num_words_ = num_words; 89 | 90 | huffman_nodes.clear(); 91 | std::priority_queue, decltype(&CompareIndex)> pq(CompareIndex); 92 | for (int i = 0; i < num_words; ++i) { 93 | huffman_nodes.emplace_back(word_count[i], i, -1, -1); 94 | pq.push(i); 95 | } 96 | for (int i = 0; i < num_words - 1; ++i) { 97 | auto& min1 = huffman_nodes[pq.top()]; pq.pop(); 98 | auto& min2 = huffman_nodes[pq.top()]; pq.pop(); 99 | huffman_nodes.emplace_back(min1.count + min2.count, i + num_words, min1.index, min2.index); 100 | pq.push(i + num_words); 101 | } 102 | 103 | std::vector, std::vector>> stack = {{pq.top(), {}, {}}}; 104 | int nodeid; 105 | std::vector code; 106 | std::vector point; 107 | std::vector> codes(num_words); 108 | std::vector> points(num_words); 109 | max_depth_ = 0; 110 | while (not stack.empty()) { 111 | std::tie(nodeid, code, point) = stack.back(); 112 | stack.pop_back(); 113 | if (nodeid < num_words) { 114 | codes[nodeid] = code; 115 | points[nodeid] = point; 116 | max_depth_ = std::max(max_depth_, 117 | static_cast(code.size())); 118 | } else { 119 | point.push_back(nodeid - num_words); 120 | std::vector left_code = code; 
121 | std::vector right_code = code; 122 | left_code.push_back(false); 123 | right_code.push_back(true); 124 | auto& node = huffman_nodes[nodeid]; 125 | stack.push_back(make_tuple(node.left, left_code, point)); 126 | stack.push_back(make_tuple(node.right, right_code, point)); 127 | } 128 | } 129 | 130 | std::vector host_codes; 131 | std::vector host_points; 132 | std::vector host_hs_indptr = {0}; 133 | int size = 0; 134 | for (int i = 0; i < num_words; ++i) { 135 | code = codes[i]; 136 | point = points[i]; 137 | int n = code.size(); 138 | size += n; 139 | host_hs_indptr.push_back(size); 140 | for (int j = 0; j < n; ++j) { 141 | host_codes.push_back(code[j]); 142 | host_points.push_back(point[j]); 143 | } 144 | } 145 | 146 | dev_codes_.resize(size); dev_points_.resize(size), dev_hs_indptr_.resize(num_words + 1); 147 | thrust::copy(host_codes.begin(), host_codes.end(), dev_codes_.begin()); 148 | thrust::copy(host_points.begin(), host_points.end(), dev_points_.begin()); 149 | thrust::copy(host_hs_indptr.begin(), host_hs_indptr.end(), dev_hs_indptr_.begin()); 150 | CHECK_CUDA(cudaDeviceSynchronize()); 151 | 152 | huffman_nodes.clear(); 153 | } 154 | 155 | void CuW2V::LoadModel(float* emb_in, float* emb_out) { 156 | int out_words = neg_? 
num_words_: num_words_ - 1; 157 | 158 | // copy embedding 159 | DEBUG("copy model({} x {})", num_words_, num_dims_); 160 | dev_emb_in_.resize(num_words_ * num_dims_); 161 | dev_emb_out_.resize(out_words * num_dims_); 162 | thrust::copy(emb_in, emb_in + num_words_ * num_dims_, dev_emb_in_.begin()); 163 | thrust::copy(emb_out, emb_out + out_words * num_dims_, dev_emb_out_.begin()); 164 | emb_in_ = emb_in; emb_out_ = emb_out; 165 | 166 | CHECK_CUDA(cudaDeviceSynchronize()); 167 | } 168 | 169 | 170 | std::pair CuW2V::FeedData(const int* cols, const int* indptr, 171 | const int num_cols, const int num_indptr) { 172 | 173 | // copy feed data to GPU memory 174 | thrust::device_vector dev_cols(num_cols); 175 | thrust::device_vector dev_indptr(num_indptr + 1); 176 | thrust::device_vector dev_loss_nume(block_cnt_, 0.0f); 177 | thrust::device_vector dev_loss_deno(block_cnt_, 0.0f); 178 | thrust::copy(cols, cols + num_cols, dev_cols.begin()); 179 | thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin()); 180 | CHECK_CUDA(cudaDeviceSynchronize()); 181 | DEBUG0("copy feed data to GPU memory"); 182 | 183 | // run GPU kernels 184 | if (neg_ > 0) { 185 | if (sg_) { 186 | W2VNegSgKernel<<>>( 187 | thrust::raw_pointer_cast(dev_cols.data()), 188 | thrust::raw_pointer_cast(dev_indptr.data()), 189 | thrust::raw_pointer_cast(dev_random_table_.data()), 190 | thrust::raw_pointer_cast(dev_rngs_.data()), 191 | random_size_, num_indptr, num_dims_, neg_, window_size_, 192 | thrust::raw_pointer_cast(dev_emb_in_.data()), 193 | thrust::raw_pointer_cast(dev_emb_out_.data()), 194 | thrust::raw_pointer_cast(dev_loss_nume.data()), 195 | thrust::raw_pointer_cast(dev_loss_deno.data()), 196 | lr_); 197 | } else { 198 | W2VNegCbowKernel<<>>( 199 | thrust::raw_pointer_cast(dev_cols.data()), 200 | thrust::raw_pointer_cast(dev_indptr.data()), 201 | thrust::raw_pointer_cast(dev_random_table_.data()), 202 | thrust::raw_pointer_cast(dev_rngs_.data()), 203 | random_size_, num_indptr, num_dims_, neg_, 
window_size_, 204 | thrust::raw_pointer_cast(dev_emb_in_.data()), 205 | thrust::raw_pointer_cast(dev_emb_out_.data()), 206 | thrust::raw_pointer_cast(dev_loss_nume.data()), 207 | thrust::raw_pointer_cast(dev_loss_deno.data()), 208 | cbow_mean_, lr_); 209 | } 210 | } else { 211 | if (sg_) { 212 | W2VHsSgKernel<<>>( 213 | thrust::raw_pointer_cast(dev_cols.data()), 214 | thrust::raw_pointer_cast(dev_indptr.data()), 215 | thrust::raw_pointer_cast(dev_codes_.data()), 216 | thrust::raw_pointer_cast(dev_points_.data()), 217 | thrust::raw_pointer_cast(dev_hs_indptr_.data()), 218 | num_indptr, num_dims_, window_size_, 219 | thrust::raw_pointer_cast(dev_rngs_.data()), 220 | thrust::raw_pointer_cast(dev_emb_in_.data()), 221 | thrust::raw_pointer_cast(dev_emb_out_.data()), 222 | thrust::raw_pointer_cast(dev_loss_nume.data()), 223 | thrust::raw_pointer_cast(dev_loss_deno.data()), 224 | lr_); 225 | 226 | } else { 227 | W2VHsCbowKernel<<>>( 228 | thrust::raw_pointer_cast(dev_cols.data()), 229 | thrust::raw_pointer_cast(dev_indptr.data()), 230 | thrust::raw_pointer_cast(dev_codes_.data()), 231 | thrust::raw_pointer_cast(dev_points_.data()), 232 | thrust::raw_pointer_cast(dev_hs_indptr_.data()), 233 | num_indptr, num_dims_, window_size_, 234 | thrust::raw_pointer_cast(dev_rngs_.data()), 235 | thrust::raw_pointer_cast(dev_emb_in_.data()), 236 | thrust::raw_pointer_cast(dev_emb_out_.data()), 237 | thrust::raw_pointer_cast(dev_loss_nume.data()), 238 | thrust::raw_pointer_cast(dev_loss_deno.data()), 239 | cbow_mean_, lr_); 240 | 241 | } 242 | 243 | } 244 | CHECK_CUDA(cudaDeviceSynchronize()); 245 | 246 | // accumulate loss nume / deno 247 | std::vector loss_nume(block_cnt_), loss_deno(block_cnt_); 248 | thrust::copy(dev_loss_nume.begin(), dev_loss_nume.end(), loss_nume.begin()); 249 | thrust::copy(dev_loss_deno.begin(), dev_loss_deno.end(), loss_deno.begin()); 250 | CHECK_CUDA(cudaDeviceSynchronize()); 251 | float loss_nume_sum = std::accumulate(loss_nume.begin(), loss_nume.end(), 
0.0f); 252 | float loss_deno_sum = std::accumulate(loss_deno.begin(), loss_deno.end(), 0.0f); 253 | DEBUG("loss nume: {}, deno: {}", loss_nume_sum, loss_deno_sum); 254 | 255 | return {loss_nume_sum, loss_deno_sum}; 256 | } 257 | 258 | void CuW2V::Pull() { 259 | thrust::copy(dev_emb_in_.begin(), dev_emb_in_.end(), emb_in_); 260 | thrust::copy(dev_emb_out_.begin(), dev_emb_out_.end(), emb_out_); 261 | CHECK_CUDA(cudaDeviceSynchronize()); 262 | } 263 | 264 | } // namespace cusim 265 | -------------------------------------------------------------------------------- /cpp/src/utils/ioutils.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #include "utils/ioutils.hpp" 7 | 8 | namespace cusim { 9 | 10 | IoUtils::IoUtils() { 11 | logger_container_.reset(new CuSimLogger("ioutils")); 12 | logger_ = logger_container_->get_logger(); 13 | } 14 | 15 | IoUtils::~IoUtils() {} 16 | 17 | bool IoUtils::Init(std::string opt_path) { 18 | std::ifstream in(opt_path.c_str()); 19 | if (not in.is_open()) return false; 20 | 21 | std::string str((std::istreambuf_iterator(in)), 22 | std::istreambuf_iterator()); 23 | std::string err_cmt; 24 | auto _opt = json11::Json::parse(str, err_cmt); 25 | if (not err_cmt.empty()) return false; 26 | opt_ = _opt; 27 | logger_container_->set_log_level(opt_["c_log_level"].int_value()); 28 | lower_ = opt_["lower"].bool_value(); 29 | return true; 30 | } 31 | 32 | void IoUtils::ParseLine(std::string line, std::vector& ret) { 33 | ParseLineImpl(line, ret); 34 | } 35 | 36 | 37 | void IoUtils::ParseLineImpl(std::string line, std::vector& ret) { 38 | ret.clear(); 39 | int n = line.size(); 40 | std::string element; 41 | for (int i = 0; i < n; ++i) { 42 | if (line[i] == ' ') { 43 | ret.push_back(element); 44 | 
element.clear(); 45 | } else { 46 | element += (lower_? std::tolower(line[i]): line[i]); 47 | } 48 | } 49 | if (element.size() > 0) { 50 | ret.push_back(element); 51 | } 52 | } 53 | 54 | int64_t IoUtils::LoadStreamFile(std::string filepath) { 55 | INFO("read stream file to generate vocabulary: {}", filepath); 56 | if (fin_.is_open()) fin_.close(); 57 | fin_.open(filepath.c_str()); 58 | int64_t count = 0; 59 | std::string line; 60 | while (getline(fin_, line)) 61 | count++; 62 | fin_.close(); 63 | fin_.open(filepath.c_str()); 64 | num_lines_ = count; 65 | remain_lines_ = num_lines_; 66 | INFO("number of lines: {}", num_lines_); 67 | return count; 68 | } 69 | 70 | std::pair IoUtils::TokenizeStream(int num_lines, int num_threads) { 71 | int read_lines = static_cast(std::min(static_cast(num_lines), remain_lines_)); 72 | if (not read_lines) return {0, 0}; 73 | remain_lines_ -= read_lines; 74 | cols_.clear(); 75 | cols_.resize(read_lines); 76 | indptr_.resize(read_lines); 77 | std::fill(indptr_.begin(), indptr_.end(), 0); 78 | #pragma omp parallel num_threads(num_threads) 79 | { 80 | std::string line; 81 | std::vector line_vec; 82 | #pragma omp for schedule(dynamic, 4) 83 | for (int i = 0; i < read_lines; ++i) { 84 | // get line thread-safely 85 | { 86 | std::unique_lock lock(global_lock_); 87 | getline(fin_, line); 88 | } 89 | 90 | // seems to be bottle-neck 91 | ParseLine(line, line_vec); 92 | 93 | // tokenize 94 | for (auto& word: line_vec) { 95 | if (not word_idmap_.count(word)) continue; 96 | cols_[i].push_back(word_idmap_[word]); 97 | } 98 | } 99 | } 100 | int cumsum = 0; 101 | for (int i = 0; i < read_lines; ++i) { 102 | cumsum += cols_[i].size(); 103 | indptr_[i] = cumsum; 104 | } 105 | return {read_lines, indptr_[read_lines - 1]}; 106 | } 107 | 108 | void IoUtils::GetToken(int* rows, int* cols, int* indptr) { 109 | int n = cols_.size(); 110 | for (int i = 0; i < n; ++i) { 111 | int beg = i == 0? 
0: indptr_[i - 1]; 112 | int end = indptr_[i]; 113 | for (int j = beg; j < end; ++j) { 114 | rows[j] = i; 115 | cols[j] = cols_[i][j - beg]; 116 | } 117 | indptr[i] = indptr_[i]; 118 | } 119 | } 120 | 121 | std::pair IoUtils::ReadStreamForVocab(int num_lines, int num_threads) { 122 | int read_lines = static_cast(std::min(static_cast(num_lines), remain_lines_)); 123 | remain_lines_ -= read_lines; 124 | #pragma omp parallel num_threads(num_threads) 125 | { 126 | std::string line; 127 | std::vector line_vec; 128 | std::unordered_map word_count; 129 | #pragma omp for schedule(dynamic, 4) 130 | for (int i = 0; i < read_lines; ++i) { 131 | // get line thread-safely 132 | { 133 | std::unique_lock lock(global_lock_); 134 | getline(fin_, line); 135 | } 136 | 137 | // seems to be bottle-neck 138 | ParseLine(line, line_vec); 139 | 140 | // update private word count 141 | for (auto& word: line_vec) { 142 | word_count[word]++; 143 | } 144 | } 145 | 146 | // update word count to class variable 147 | { 148 | std::unique_lock lock(global_lock_); 149 | for (auto& it: word_count) { 150 | word_count_[it.first] += it.second; 151 | } 152 | } 153 | } 154 | if (not remain_lines_) fin_.close(); 155 | return {read_lines, word_count_.size()}; 156 | } 157 | 158 | void IoUtils::GetWordVocab(int min_count, std::string keys_path, std::string count_path) { 159 | INFO("number of raw words: {}", word_count_.size()); 160 | word_idmap_.clear(); word_list_.clear(); 161 | for (auto& it: word_count_) { 162 | if (it.second >= min_count) { 163 | word_idmap_[it.first] = word_idmap_.size(); 164 | word_list_.push_back(it.first); 165 | } 166 | } 167 | INFO("number of words after filtering: {}", word_list_.size()); 168 | 169 | // write keys and count to csv file 170 | std::ofstream fout1(keys_path.c_str()); 171 | std::ofstream fout2(count_path.c_str()); 172 | INFO("dump keys to {}", keys_path); 173 | int n = word_list_.size(); 174 | for (int i = 0; i < n; ++i) { 175 | std::string line = word_list_[i] + "\n"; 
176 | fout1.write(line.c_str(), line.size()); 177 | line = std::to_string(word_count_[word_list_[i]]) + "\n"; 178 | fout2.write(line.c_str(), line.size()); 179 | } 180 | fout1.close(); fout2.close(); 181 | } 182 | 183 | std::tuple IoUtils::ReadBagOfWordsHeader(std::string filepath) { 184 | INFO("read bag of words file: {} (format reference: https://archive.ics.uci.edu/ml/datasets/bag+of+words)", 185 | filepath); 186 | if (fin_.is_open()) fin_.close(); 187 | fin_.open(filepath.c_str()); 188 | std::string line; 189 | std::stringstream sstr; 190 | int64_t num_docs, nnz; 191 | int num_words; 192 | getline(fin_, line); 193 | sstr << line; sstr >> num_docs; sstr.clear(); 194 | getline(fin_, line); 195 | num_words = std::stoi(line); 196 | getline(fin_, line); 197 | sstr << line; sstr >> nnz; sstr.clear(); 198 | return {num_docs, num_words, nnz}; 199 | } 200 | 201 | void IoUtils::ReadBagOfWordsContent(int64_t* rows, int* cols, float* counts, const int num_lines) { 202 | if (not fin_.is_open()) throw std::runtime_error("file is not open"); 203 | std::string line; 204 | std::stringstream sstr; 205 | int64_t row; 206 | int col; 207 | float count; 208 | std::vector line_vec; 209 | for (int i = 0; i < num_lines; ++i) { 210 | getline(fin_, line); 211 | ParseLine(line, line_vec); 212 | sstr << line_vec[0]; sstr >> row; sstr.clear(); 213 | col = std::stoi(line_vec[1]); 214 | count = std::stof(line_vec[2]); 215 | rows[i] = row - 1; cols[i] = col - 1; counts[i] = count; 216 | line_vec.clear(); 217 | } 218 | if (fin_.eof()) fin_.close(); 219 | } 220 | 221 | } // namespace cusim 222 | -------------------------------------------------------------------------------- /cpp/src/utils/log.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 
6 | 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc 8 | #include "utils/log.hpp" 9 | 10 | 11 | namespace cusim { 12 | int CuSimLogger::global_logging_level_ = 2; 13 | 14 | CuSimLogger::CuSimLogger() { 15 | spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v"); 16 | logger_ = spdlog::default_logger(); 17 | } 18 | 19 | CuSimLogger::CuSimLogger(std::string name) { 20 | // auto console_sink = std::make_shared(); 21 | auto stderr_sink = std::make_shared(); 22 | // spdlog::sinks_init_list sinks = {console_sink, stderr_sink}; 23 | logger_ = std::make_shared(name, stderr_sink); 24 | logger_->set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v"); 25 | } 26 | 27 | std::shared_ptr& CuSimLogger::get_logger() { 28 | return logger_; 29 | } 30 | 31 | void CuSimLogger::set_log_level(int level) { 32 | global_logging_level_ = level; 33 | switch (level) { 34 | case 0: logger_->set_level(spdlog::level::off); break; 35 | case 1: logger_->set_level(spdlog::level::warn); break; 36 | case 2: logger_->set_level(spdlog::level::info); break; 37 | case 3: logger_->set_level(spdlog::level::debug); break; 38 | default: logger_->set_level(spdlog::level::trace); break; 39 | } 40 | } 41 | 42 | int CuSimLogger::get_log_level() { 43 | return global_logging_level_; 44 | } 45 | 46 | } // namespace cusim 47 | -------------------------------------------------------------------------------- /cuda_setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # Adapted from https://github.com/rmcgibbo/npcuda-example and 8 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py 9 | # pylint: disable=fixme,access-member-before-definition 10 | # pylint: disable=attribute-defined-outside-init,arguments-differ 11 | import logging 12 | import os 13 | import sys 14 | 15 | from distutils import ccompiler, errors, msvccompiler, unixccompiler 16 | from setuptools.command.build_ext import build_ext as setuptools_build_ext 17 | 18 | HALF_PRECISION = False 19 | 20 | def find_in_path(name, path): 21 | "Find a file in a search path" 22 | # adapted fom http://code.activestate.com/ 23 | # recipes/52224-find-a-file-given-a-search-path/ 24 | for _dir in path.split(os.pathsep): 25 | binpath = os.path.join(_dir, name) 26 | if os.path.exists(binpath): 27 | return os.path.abspath(binpath) 28 | return None 29 | 30 | # reference: https://arnon.dk/ 31 | # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ 32 | def get_cuda_sm_list(cuda_ver): 33 | if "CUDA_SM_LIST" in os.environ: 34 | sm_list = os.environ["CUDA_SM_LIST"].split(",") 35 | else: 36 | sm_list = ["30", "52", "60", "61", "70", "75", "80", "86"] 37 | if cuda_ver >= 110: 38 | filter_list = ["30"] 39 | if cuda_ver == 110: 40 | filter_list += ["86"] 41 | else: 42 | filter_list = ["80", "86"] 43 | if cuda_ver < 100: 44 | filter_list += ["75"] 45 | if cuda_ver < 90: 46 | filter_list += ["70"] 47 | if cuda_ver < 80: 48 | filter_list += ["60", "61"] 49 | sm_list = [sm for sm in sm_list if sm not in filter_list] 50 | return sm_list 51 | 52 | 53 | def get_cuda_compute(cuda_ver): 54 | if "CUDA_COMPUTE" in os.environ: 55 | compute = os.environ["CUDA_COMPUTE"] 56 | else: 57 | if 70 <= cuda_ver < 80: 58 | compute = "52" 59 | if 80 <= cuda_ver < 90: 60 | compute = "61" 61 | if 90 <= cuda_ver < 100: 62 | compute = "70" 63 | if 100 <= cuda_ver < 110: 64 | compute = "75" 65 | if cuda_ver == 110: 66 | compute = "80" 67 | if cuda_ver == 111: 68 | compute = "86" 69 | 
return compute 70 | 71 | 72 | def get_cuda_arch(cuda_ver): 73 | if "CUDA_ARCH" in os.environ: 74 | arch = os.environ["CUDA_ARCH"] 75 | else: 76 | if 70 <= cuda_ver < 92: 77 | arch = "30" 78 | if 92 <= cuda_ver < 110: 79 | arch = "50" 80 | if cuda_ver == 110: 81 | arch = "52" 82 | if cuda_ver == 111: 83 | arch = "80" 84 | return arch 85 | 86 | def locate_cuda(): 87 | """Locate the CUDA environment on the system 88 | If a valid cuda installation is found 89 | this returns a dict with keys 'home', 'nvcc', 'include', 90 | and 'lib64' and values giving the absolute path to each directory. 91 | Starts by looking for the CUDAHOME env variable. 92 | If not found, everything is based on finding 93 | 'nvcc' in the PATH. 94 | If nvcc can't be found, this returns None 95 | """ 96 | nvcc_bin = 'nvcc' 97 | if sys.platform.startswith("win"): 98 | nvcc_bin = 'nvcc.exe' 99 | 100 | # check env variables CUDA_HOME, CUDAHOME, CUDA_PATH. 101 | found = False 102 | for env_name in ['CUDA_PATH', 'CUDAHOME', 'CUDA_HOME']: 103 | if env_name not in os.environ: 104 | continue 105 | found = True 106 | home = os.environ[env_name] 107 | nvcc = os.path.join(home, 'bin', nvcc_bin) 108 | break 109 | if not found: 110 | # otherwise, search the PATH for NVCC 111 | nvcc = find_in_path(nvcc_bin, os.environ['PATH']) 112 | if nvcc is None: 113 | logging.warning('The nvcc binary could not be located in your ' 114 | '$PATH. 
Either add it to ' 115 | 'your path, or set $CUDA_HOME to enable CUDA extensions') 116 | return None 117 | home = os.path.dirname(os.path.dirname(nvcc)) 118 | cudaconfig = {'home': home, 119 | 'nvcc': nvcc, 120 | 'include': os.path.join(home, 'include'), 121 | 'lib64': os.path.join(home, 'lib64')} 122 | cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".") 123 | major, minor = int(cuda_ver[0]), int(cuda_ver[1]) 124 | cuda_ver = 10 * major + minor 125 | assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}" 126 | print(f"cuda_ver: {major}.{minor}") 127 | arch = get_cuda_arch(cuda_ver) 128 | sm_list = get_cuda_sm_list(cuda_ver) 129 | compute = get_cuda_compute(cuda_ver) 130 | post_args = [f"-arch=sm_{arch}"] + \ 131 | [f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \ 132 | [f"-gencode=arch=compute_{compute},code=compute_{compute}", 133 | "--ptxas-options=-v", "-O2"] 134 | print(f"nvcc post args: {post_args}") 135 | if HALF_PRECISION: 136 | post_args = [flag for flag in post_args if "52" not in flag] 137 | 138 | if sys.platform == "win32": 139 | cudaconfig['lib64'] = os.path.join(home, 'lib', 'x64') 140 | post_args += ['-Xcompiler', '/MD', '-std=c++14', "-Xcompiler", "/openmp"] 141 | if HALF_PRECISION: 142 | post_args += ["-Xcompiler", "/D HALF_PRECISION"] 143 | else: 144 | post_args += ['-c', '--compiler-options', "'-fPIC'", 145 | "--compiler-options", "'-std=c++14'"] 146 | if HALF_PRECISION: 147 | post_args += ["--compiler-options", "'-D HALF_PRECISION'"] 148 | for k, val in cudaconfig.items(): 149 | if not os.path.exists(val): 150 | logging.warning('The CUDA %s path could not be located in %s', k, val) 151 | return None 152 | 153 | cudaconfig['post_args'] = post_args 154 | return cudaconfig 155 | 156 | 157 | # This code to build .cu extensions with nvcc is taken from cupy: 158 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py 159 | class _UnixCCompiler(unixccompiler.UnixCCompiler): 160 | src_extensions = 
list(unixccompiler.UnixCCompiler.src_extensions) 161 | src_extensions.append('.cu') 162 | 163 | def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts): 164 | # For sources other than CUDA C ones, just call the super class method. 165 | if os.path.splitext(src)[1] != '.cu': 166 | return unixccompiler.UnixCCompiler._compile( 167 | self, obj, src, ext, cc_args, extra_postargs, pp_opts) 168 | 169 | # For CUDA C source files, compile them with NVCC. 170 | _compiler_so = self.compiler_so 171 | try: 172 | nvcc_path = CUDA['nvcc'] 173 | post_args = CUDA['post_args'] 174 | # TODO? base_opts = build.get_compiler_base_options() 175 | self.set_executable('compiler_so', nvcc_path) 176 | 177 | return unixccompiler.UnixCCompiler._compile( 178 | self, obj, src, ext, cc_args, post_args, pp_opts) 179 | finally: 180 | self.compiler_so = _compiler_so 181 | 182 | 183 | class _MSVCCompiler(msvccompiler.MSVCCompiler): 184 | _cu_extensions = ['.cu'] 185 | 186 | src_extensions = list(unixccompiler.UnixCCompiler.src_extensions) 187 | src_extensions.extend(_cu_extensions) 188 | 189 | def _compile_cu(self, sources, output_dir=None, macros=None, 190 | include_dirs=None, debug=0, extra_preargs=None, 191 | extra_postargs=None, depends=None): 192 | # Compile CUDA C files, mainly derived from UnixCCompiler._compile(). 
193 | macros, objects, extra_postargs, pp_opts, _build = \ 194 | self._setup_compile(output_dir, macros, include_dirs, sources, 195 | depends, extra_postargs) 196 | 197 | compiler_so = CUDA['nvcc'] 198 | cc_args = self._get_cc_args(pp_opts, debug, extra_preargs) 199 | post_args = CUDA['post_args'] 200 | 201 | for obj in objects: 202 | try: 203 | src, _ = _build[obj] 204 | except KeyError: 205 | continue 206 | try: 207 | self.spawn([compiler_so] + cc_args + [src, '-o', obj] + post_args) 208 | except errors.DistutilsExecError as e: 209 | raise errors.CompileError(str(e)) 210 | 211 | return objects 212 | 213 | def compile(self, sources, **kwargs): 214 | # Split CUDA C sources and others. 215 | cu_sources = [] 216 | other_sources = [] 217 | for source in sources: 218 | if os.path.splitext(source)[1] == '.cu': 219 | cu_sources.append(source) 220 | else: 221 | other_sources.append(source) 222 | 223 | # Compile source files other than CUDA C ones. 224 | other_objects = msvccompiler.MSVCCompiler.compile( 225 | self, other_sources, **kwargs) 226 | 227 | # Compile CUDA C sources. 228 | cu_objects = self._compile_cu(cu_sources, **kwargs) 229 | 230 | # Return compiled object filenames. 231 | return other_objects + cu_objects 232 | 233 | 234 | class CudaBuildExt(setuptools_build_ext): 235 | """Custom `build_ext` command to include CUDA C source files.""" 236 | 237 | def run(self): 238 | if CUDA is not None: 239 | def wrap_new_compiler(func): 240 | def _wrap_new_compiler(*args, **kwargs): 241 | try: 242 | return func(*args, **kwargs) 243 | except errors.DistutilsPlatformError: 244 | if sys.platform != 'win32': 245 | CCompiler = _UnixCCompiler 246 | else: 247 | CCompiler = _MSVCCompiler 248 | return CCompiler( 249 | None, kwargs['dry_run'], kwargs['force']) 250 | return _wrap_new_compiler 251 | ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler) 252 | # Intentionally causes DistutilsPlatformError in 253 | # ccompiler.new_compiler() function to hook. 
254 | self.compiler = 'nvidia' 255 | 256 | setuptools_build_ext.run(self) 257 | 258 | 259 | CUDA = locate_cuda() 260 | assert CUDA is not None 261 | BUILDEXT = CudaBuildExt if CUDA else setuptools_build_ext 262 | -------------------------------------------------------------------------------- /cusim/.gitignore: -------------------------------------------------------------------------------- 1 | config_pb2.py 2 | -------------------------------------------------------------------------------- /cusim/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from cusim.ioutils import IoUtils 7 | from cusim.culda import CuLDA 8 | from cusim.cuw2v import CuW2V 9 | -------------------------------------------------------------------------------- /cusim/aux.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
import os
import re
import sys
import json
import time
import logging
import logging.handlers

import numpy as np  # used by Progbar below


# get_logger and Option refer to
# https://github.com/kakao/buffalo/blob/
# 5f571c2c7d8227e6625c6e538da929e4db11b66d/buffalo/misc/aux.py
def get_logger(name=__file__, level=2):
  """Return a console logger, configured once per ``name``.

  Args:
    name: logger name (defaults to this file's path).
    level: verbosity shorthand (1: WARNING, 2: INFO, 3: DEBUG); any other
      value is forwarded to ``logging`` unchanged.
  """
  if level == 1:
    level = logging.WARNING
  elif level == 2:
    level = logging.INFO
  elif level == 3:
    level = logging.DEBUG
  logger = logging.getLogger(name)
  if logger.handlers:
    # already configured: return as-is to avoid attaching duplicate handlers
    return logger
  logger.setLevel(level)
  sh0 = logging.StreamHandler()
  sh0.setLevel(level)
  formatter = logging.Formatter('[%(levelname)-8s] %(asctime)s '
                                '[%(filename)s] [%(funcName)s:%(lineno)d]'
                                '%(message)s', '%Y-%m-%d %H:%M:%S')
  sh0.setFormatter(formatter)
  logger.addHandler(sh0)
  return logger


# This function helps you to read non-standard json strings.
# - Handles json string with c++ style inline comments
# - Handles json string with trailing commas.
def load_json_string(cont):
  """Parse a json string that may contain comments and trailing commas."""
  # imported lazily so the stdlib-only helpers in this module remain
  # importable without the optional jsmin dependency
  import jsmin

  # (1) Removes comment.
  # Refer to https://plus.google.com/+DouglasCrockfordEsq/posts/RK8qyGVaGSr
  cont = jsmin.jsmin(cont)

  # (2) Removes trailing comma.
  cont = re.sub(",[ \t\r\n]*}", "}", cont)
  cont = re.sub(",[ \t\r\n]*" + r"\]", "]", cont)

  return json.loads(cont)


# function read json file from filename
def load_json_file(fname):
  """Read and parse a (possibly non-standard) json file."""
  # explicit encoding: do not depend on the platform default
  with open(fname, "r", encoding="utf8") as fin:
    return load_json_string(fin.read())


# use protobuf to restrict field and types
def get_opt_as_proto(raw, proto_type=None):
  """Convert a raw dict (or json path) into a protobuf message.

  Raises AssertionError when ``proto_type`` is missing or required
  fields of the resulting message are not set.
  """
  from google.protobuf.json_format import Parse  # optional heavy dep
  assert proto_type is not None
  proto = proto_type()
  # convert raw to proto
  Parse(json.dumps(Option(raw)), proto)
  err = []
  assert proto.IsInitialized(err), \
    f"some required fields are missing in proto {err}\n {proto}"
  return proto


def proto_to_dict(proto):
  """Convert a protobuf message to a plain dict, keeping default values."""
  from google.protobuf.json_format import MessageToDict  # optional heavy dep
  return MessageToDict(proto,
                       including_default_value_fields=True,
                       preserving_proto_field_name=True)


def copy_proto(proto):
  """Deep-copy a protobuf message via its json representation."""
  from google.protobuf.json_format import Parse  # optional heavy dep
  newproto = type(proto)()
  Parse(json.dumps(proto_to_dict(proto)), newproto)
  return newproto


class Option(dict):
  """Dict whose items are also reachable as attributes.

  Positional arguments may be dicts or paths of json files; nested dicts
  are wrapped recursively so that ``opt.a.b`` works.  Missing attributes
  resolve to ``None`` instead of raising AttributeError.
  """

  def __init__(self, *args, **kwargs):
    args = [arg if isinstance(arg, dict)
            else load_json_file(arg) for arg in args]
    super().__init__(*args, **kwargs)
    for arg in args:
      if isinstance(arg, dict):
        for k, val in arg.items():
          self[k] = Option(val) if isinstance(val, dict) else val
    if kwargs:
      for k, val in kwargs.items():
        self[k] = Option(val) if isinstance(val, dict) else val

  def __getattr__(self, attr):
    # only called when normal attribute lookup fails; unknown keys -> None
    return self.get(attr)

  def __setattr__(self, key, value):
    self.__setitem__(key, value)

  def __setitem__(self, key, value):
    super().__setitem__(key, value)
    # mirror into __dict__ so attribute access stays in sync with the dict
    self.__dict__.update({key: value})

  def __delattr__(self, item):
    self.__delitem__(item)

  def __delitem__(self, key):
    super().__delitem__(key)
    del self.__dict__[key]

  def __getstate__(self):
    return vars(self)

  def __setstate__(self, state):
    vars(self).update(state)
# reference: https://github.com/tensorflow/tensorflow/blob/
# 85c8b2a817f95a3e979ecd1ed95bff1dc1335cff/tensorflow/python/
# keras/utils/generic_utils.py#L483
class Progbar:
  # pylint: disable=too-many-branches,too-many-statements,invalid-name
  # pylint: disable=blacklisted-name,no-else-return
  """Displays a progress bar.
  Arguments:
    target: Total number of steps expected, None if unknown.
    width: Progress bar width on screen.
    verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
    stateful_metrics: Iterable of string names of metrics that should *not* be
      averaged over time. Metrics in this list will be displayed as-is. All
      others will be averaged by the progbar before display.
    interval: Minimum visual progress update interval (in seconds).
    unit_name: Display name for step counts (usually "step" or "sample").
  """

  def __init__(self,
               target,
               width=30,
               verbose=1,
               interval=0.05,
               stateful_metrics=None,
               unit_name='step'):
    self.target = target
    self.width = width
    self.verbose = verbose
    self.interval = interval
    self.unit_name = unit_name
    self.stateful_metrics = set(stateful_metrics) if stateful_metrics else set()

    # rewrite the line in place only when stdout behaves like a terminal
    self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
                              sys.stdout.isatty()) or
                             'ipykernel' in sys.modules or
                             'posix' in sys.modules or
                             'PYCHARM_HOSTED' in os.environ)
    self._total_width = 0
    self._seen_so_far = 0
    # We use a dict + list to avoid garbage collection
    # issues found in OrderedDict
    self._values = {}
    self._values_order = []
    self._start = time.time()
    self._last_update = 0

    self._time_after_first_step = None

  def update(self, current, values=None, finalize=None):
    """Updates the progress bar.
    Arguments:
      current: Index of current step.
      values: List of tuples: `(name, value_for_last_step)`. If `name` is in
        `stateful_metrics`, `value_for_last_step` will be displayed as-is.
        Else, an average of the metric over time will be displayed.
      finalize: Whether this is the last update for the progress bar. If
        `None`, defaults to `current >= self.target`.
    """
    if finalize is None:
      if self.target is None:
        finalize = False
      else:
        finalize = current >= self.target

    values = values or []
    for k, v in values:
      if k not in self._values_order:
        self._values_order.append(k)
      if k not in self.stateful_metrics:
        # In the case that progress bar doesn't have a target value in the
        # first epoch, both on_batch_end and on_epoch_end will be called,
        # which will cause 'current' and 'self._seen_so_far' to have the same
        # value. Force the minimal value to 1 here, otherwise
        # stateful_metric will be 0s.
        value_base = max(current - self._seen_so_far, 1)
        if k not in self._values:
          self._values[k] = [v * value_base, value_base]
        else:
          self._values[k][0] += v * value_base
          self._values[k][1] += value_base
      else:
        # Stateful metrics output a numeric value. This representation
        # means "take an average from a single value" but keeps the
        # numeric formatting.
        self._values[k] = [v, 1]
    self._seen_so_far = current

    now = time.time()
    info = ' - %.0fs' % (now - self._start)
    if self.verbose == 1:
      if now - self._last_update < self.interval and not finalize:
        return

      prev_total_width = self._total_width
      if self._dynamic_display:
        sys.stdout.write('\b' * prev_total_width)
        sys.stdout.write('\r')
      else:
        sys.stdout.write('\n')

      if self.target is not None:
        # fix: guard against target == 0 (np.log10(0) / division by zero)
        numdigits = int(np.log10(max(self.target, 1))) + 1
        bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target)
        prog = float(current) / max(self.target, 1)
        prog_width = int(self.width * prog)
        if prog_width > 0:
          bar += ('=' * (prog_width - 1))
          if current < self.target:
            bar += '>'
          else:
            bar += '='
        bar += ('.' * (self.width - prog_width))
        bar += ']'
      else:
        bar = '%7d/Unknown' % current

      self._total_width = len(bar)
      sys.stdout.write(bar)

      time_per_unit = self._estimate_step_duration(current, now)

      if self.target is None or finalize:
        if time_per_unit >= 1 or time_per_unit == 0:
          info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
        elif time_per_unit >= 1e-3:
          info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
        else:
          info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
      else:
        eta = time_per_unit * (self.target - current)
        if eta > 3600:
          eta_format = '%d:%02d:%02d' % (eta // 3600,
                                         (eta % 3600) // 60, eta % 60)
        elif eta > 60:
          eta_format = '%d:%02d' % (eta // 60, eta % 60)
        else:
          eta_format = '%ds' % eta

        info = ' - ETA: %s' % eta_format

      for k in self._values_order:
        info += ' - %s:' % k
        if isinstance(self._values[k], list):
          avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
          if abs(avg) > 1e-3:
            info += ' %.4f' % avg
          else:
            info += ' %.4e' % avg
        else:
          info += ' %s' % self._values[k]

      self._total_width += len(info)
      if prev_total_width > self._total_width:
        info += (' ' * (prev_total_width - self._total_width))

      if finalize:
        info += '\n'

      sys.stdout.write(info)
      sys.stdout.flush()

    elif self.verbose == 2:
      if finalize:
        # fix: target may be None here; only print the count prefix when
        # a target is known (previously crashed on np.log10(None))
        if self.target is not None:
          numdigits = int(np.log10(max(self.target, 1))) + 1
          count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
          info = count + info
        for k in self._values_order:
          info += ' - %s:' % k
          avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
          # fix: use abs(avg), consistent with verbose == 1; a negative
          # average close to zero should also use scientific notation
          if abs(avg) > 1e-3:
            info += ' %.4f' % avg
          else:
            info += ' %.4e' % avg
        info += '\n'

        sys.stdout.write(info)
        sys.stdout.flush()

    self._last_update = now

  def add(self, n, values=None):
    """Advance the bar by ``n`` steps."""
    self.update(self._seen_so_far + n, values)

  def _estimate_step_duration(self, current, now):
    """Estimate the duration of a single step.
    Given the step number `current` and the corresponding time `now`
    this function returns an estimate for how long a single step
    takes. If this is called before one step has been completed
    (i.e. `current == 0`) then zero is given as an estimate. The duration
    estimate ignores the duration of the (assumed to be non-representative)
    first step for estimates when more steps are available (i.e. `current>1`).
    Arguments:
      current: Index of current step.
      now: The current time.
    Returns: Estimate of the duration of a single step.
    """
    if current:
      # there are a few special scenarios here:
      # 1) somebody is calling the progress bar without ever supplying step 1
      # 2) somebody is calling the progress bar and supplies step one multiple
      #    times, e.g. as part of a finalizing call
      # in these cases, we just fall back to the simple calculation
      if self._time_after_first_step is not None and current > 1:
        time_per_unit = (now - self._time_after_first_step) / (current - 1)
      else:
        time_per_unit = (now - self._start) / current

      if current == 1:
        self._time_after_first_step = now
      return time_per_unit
    else:
      return 0


# ---- cusim/constants.py ----
# numerical floor used when normalizing / clamping model parameters
EPS = 1e-10
# threads per CUDA warp; block dims must be a multiple of this
WARP_SIZE = 32

# ---- cusim/culda/__init__.py re-exports CuLDA from cusim.culda.pyculda ----
6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include "culda/culda.hpp" 12 | 13 | namespace py = pybind11; 14 | 15 | typedef py::array_t float_array; 16 | typedef py::array_t int_array; 17 | typedef py::array_t bool_array; 18 | 19 | class CuLDABind { 20 | public: 21 | CuLDABind() {} 22 | 23 | bool Init(std::string opt_path) { 24 | return obj_.Init(opt_path); 25 | } 26 | 27 | void LoadModel(py::object& alpha, py::object& beta, 28 | py::object& grad_alpha, py::object& new_beta) { 29 | // check shape of alpha and beta 30 | float_array _alpha(alpha); 31 | float_array _beta(beta); 32 | auto alpha_buffer = _alpha.request(); 33 | auto beta_buffer = _beta.request(); 34 | if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or 35 | alpha_buffer.shape[0] != beta_buffer.shape[1]) { 36 | throw std::runtime_error("invalid alpha or beta"); 37 | } 38 | 39 | // check shape of grad alpha and new beta 40 | float_array _grad_alpha(grad_alpha); 41 | float_array _new_beta(new_beta); 42 | auto grad_alpha_buffer = _grad_alpha.request(); 43 | auto new_beta_buffer = _new_beta.request(); 44 | if (grad_alpha_buffer.ndim != 2 or 45 | new_beta_buffer.ndim != 2 or 46 | grad_alpha_buffer.shape[1] != new_beta_buffer.shape[1]) { 47 | throw std::runtime_error("invalid grad_alpha or new_beta"); 48 | } 49 | 50 | int num_words = beta_buffer.shape[0]; 51 | 52 | return obj_.LoadModel(_alpha.mutable_data(0), 53 | _beta.mutable_data(0), 54 | _grad_alpha.mutable_data(0), 55 | _new_beta.mutable_data(0), num_words); 56 | } 57 | 58 | std::pair FeedData(py::object& cols, 59 | py::object& indptr, py::object& vali, py::object& counts, 60 | py::object& gamma, const bool init_gamma, 61 | const int num_iters) { 62 | int_array _cols(cols); 63 | int_array _indptr(indptr); 64 | bool_array _vali(vali); 65 | float_array _counts(counts); 66 | float_array _gamma(gamma); 67 | auto cols_buffer = _cols.request(); 68 | auto indptr_buffer = _indptr.request(); 69 | auto vali_buffer = _vali.request(); 70 | auto 
counts_buffer = _counts.request(); 71 | auto gamma_buffer = _gamma.request(); 72 | if (cols_buffer.ndim != 1 or 73 | indptr_buffer.ndim != 1 or 74 | vali_buffer.ndim != 1 or 75 | counts_buffer.ndim != 1 or 76 | gamma_buffer.ndim != 2) { 77 | throw std::runtime_error("invalid ndim"); 78 | } 79 | int num_cols = cols_buffer.shape[0]; 80 | int num_indptr = indptr_buffer.shape[0] - 1; 81 | 82 | if (vali_buffer.shape[0] != num_cols or 83 | counts_buffer.shape[0] != num_cols or 84 | gamma_buffer.shape[0] != num_indptr) { 85 | throw std::runtime_error("invalid length"); 86 | } 87 | return obj_.FeedData(_cols.data(0), _indptr.data(0), 88 | _vali.data(0), _counts.data(0), _gamma.mutable_data(0), 89 | init_gamma, num_cols, num_indptr, num_iters); 90 | } 91 | 92 | void Pull() { 93 | obj_.Pull(); 94 | } 95 | 96 | void Push() { 97 | obj_.Push(); 98 | } 99 | 100 | int GetBlockCnt() { 101 | return obj_.GetBlockCnt(); 102 | } 103 | 104 | private: 105 | cusim::CuLDA obj_; 106 | }; 107 | 108 | PYBIND11_PLUGIN(culda_bind) { 109 | py::module m("CuLDABind"); 110 | 111 | py::class_(m, "CuLDABind") 112 | .def(py::init()) 113 | .def("init", &CuLDABind::Init, py::arg("opt_path")) 114 | .def("load_model", &CuLDABind::LoadModel, 115 | py::arg("alpha"), py::arg("beta"), 116 | py::arg("grad_alpha"), py::arg("new_beta")) 117 | .def("feed_data", &CuLDABind::FeedData, 118 | py::arg("cols"), py::arg("indptr"), py::arg("vali"), 119 | py::arg("counts"), py::arg("gamma"), 120 | py::arg("init_gamma"), py::arg("num_iters")) 121 | .def("pull", &CuLDABind::Pull) 122 | .def("push", &CuLDABind::Push) 123 | .def("get_block_cnt", &CuLDABind::GetBlockCnt) 124 | .def("__repr__", 125 | [](const CuLDABind &a) { 126 | return ""; 127 | } 128 | ); 129 | return m.ptr(); 130 | } 131 | -------------------------------------------------------------------------------- /cusim/culda/pyculda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All 
# Copyright (c) 2021 Jisang Yoon
# All rights reserved.
#
# This source code is licensed under the Apache 2.0 license found in the
# LICENSE file in the root directory of this source tree.

# pylint: disable=no-name-in-module,too-few-public-methods,no-member
import os
from os.path import join as pjoin

import json
import atexit
import shutil
import tempfile

import h5py
import numpy as np
from scipy.special import polygamma as pg

from cusim import aux, IoUtils
from cusim.culda.culda_bind import CuLDABind
from cusim.config_pb2 import CuLDAConfigProto
from cusim.constants import EPS, WARP_SIZE


class CuLDA:
  """Python wrapper around the CUDA LDA trainer (CuLDABind)."""

  def __init__(self, opt=None):
    """Validate options, dump them to a temp json file and init the binding.

    Args:
      opt: dict or json file path convertible to CuLDAConfigProto.
    """
    self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
    self.logger = aux.get_logger("culda", level=self.opt.py_log_level)

    assert self.opt.block_dim <= WARP_SIZE ** 2 and \
      self.opt.block_dim % WARP_SIZE == 0, \
      f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"

    # the C++ side reads options from a json file, so round-trip through
    # a temporary file removed immediately after init
    tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
    opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
    tmp.write(opt_content)
    tmp.close()

    self.logger.info("opt: %s", opt_content)
    self.obj = CuLDABind()
    assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
    os.remove(tmp.name)

    self.words, self.num_words, self.num_docs = None, None, None
    self.alpha, self.beta, self.grad_alpha, self.new_beta = \
      None, None, None, None

    self.tmp_dirs = []
    atexit.register(self.remove_tmp)

  def preprocess_data(self):
    """Convert the raw bow data file to h5 unless preprocessing is skipped."""
    if self.opt.skip_preprocess:
      return
    iou = IoUtils(aux.proto_to_dict(self.opt.io))
    if not self.opt.processed_data_path:
      # fix: tempfile.TemporaryDirectory().name was used before; that
      # directory is deleted by the object's finalizer as soon as it is
      # garbage-collected.  mkdtemp creates a directory we remove ourselves
      # in remove_tmp.
      data_dir = tempfile.mkdtemp()
      self.tmp_dirs.append(data_dir)
      self.opt.processed_data_path = pjoin(data_dir, "token.h5")
    iou.convert_bow_to_h5(self.opt.data_path, self.opt.processed_data_path)

  def init_model(self):
    """Load the vocabulary and randomly initialize model parameters."""
    # count number of docs and load voca
    assert os.path.exists(self.opt.processed_data_path)
    assert os.path.exists(self.opt.keys_path)
    h5f = h5py.File(self.opt.processed_data_path, "r")
    self.num_docs = h5f["indptr"].shape[0] - 1
    h5f.close()
    with open(self.opt.keys_path, "rb") as fin:
      self.words = [line.strip().decode("utf8") for line in fin]
    self.num_words = len(self.words)

    self.logger.info("number of words: %d, docs: %d",
                     self.num_words, self.num_docs)

    # random initialize alpha and beta
    np.random.seed(self.opt.seed)
    self.alpha = np.random.uniform( \
      size=(self.opt.num_topics,)).astype(np.float32)
    self.beta = np.random.uniform( \
      size=(self.num_words, self.opt.num_topics)).astype(np.float32)
    # normalize each topic (column) of beta to sum to one
    self.beta /= np.sum(self.beta, axis=0)[None, :]
    self.logger.info("alpha %s, beta %s initialized",
                     self.alpha.shape, self.beta.shape)

    # zero initialize grad alpha and new beta; grad_alpha is accumulated
    # per CUDA block, hence the (block_cnt, num_topics) shape
    block_cnt = self.obj.get_block_cnt()
    self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
                               dtype=np.float32)
    self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
    self.logger.info("grad alpha %s, new beta %s initialized",
                     self.grad_alpha.shape, self.new_beta.shape)

    # set h5 file path to backup gamma
    if not self.opt.gamma_path:
      # fix: mkdtemp instead of TemporaryDirectory().name (see
      # preprocess_data); the directory is cleaned up in remove_tmp
      data_dir = tempfile.mkdtemp()
      self.tmp_dirs.append(data_dir)
      self.opt.gamma_path = pjoin(data_dir, "gamma.h5")
    self.logger.info("backup gamma to %s", self.opt.gamma_path)
    os.makedirs(os.path.dirname(self.opt.gamma_path), exist_ok=True)
    h5f = h5py.File(self.opt.gamma_path, "w")
    h5f.create_dataset("gamma", shape=(self.num_docs, self.opt.num_topics),
                       dtype=np.float32)
    h5f.close()

    # push it to gpu
    self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)

  def train_model(self):
    """Run EM: one E step over all docs and one M step per epoch."""
    self.preprocess_data()
    self.init_model()
    h5f = h5py.File(self.opt.processed_data_path, "r")
    for epoch in range(1, self.opt.epochs + 1):
      gamma_h5f = h5py.File(self.opt.gamma_path, "r+")
      self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
      self._train_e_step(h5f, gamma_h5f["gamma"], epoch)
      self._train_m_step()
      gamma_h5f.close()
    h5f.close()

  def _train_e_step(self, h5f, gamma_h5f, epoch):
    """Stream minibatches of the CSR corpus through the CUDA E step.

    Args:
      h5f: open h5 file with "rows"/"cols"/"indptr"/"counts"/"vali".
      gamma_h5f: the "gamma" dataset of the gamma backup file.
      epoch: 1-based epoch number (gamma is (re)initialized on epoch 1).
    """
    offset, size = 0, h5f["cols"].shape[0]
    pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
    train_loss_nume, train_loss_deno = 0, 0
    vali_loss_nume, vali_loss_deno = 0, 0
    while True:
      # offset / next_offset are document indices; beg / end index the
      # nonzero entries belonging to those documents
      target = h5f["indptr"][offset] + self.opt.batch_size
      if target < size:
        next_offset = h5f["rows"][target]
      else:
        next_offset = h5f["indptr"].shape[0] - 1
      indptr = h5f["indptr"][offset:next_offset + 1]
      beg, end = indptr[0], indptr[-1]
      indptr -= beg
      cols = h5f["cols"][beg:end]
      counts = h5f["counts"][beg:end]
      # fix: np.bool was removed in numpy >= 1.24; builtin bool is the
      # documented replacement
      vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(bool)
      gamma = gamma_h5f[offset:next_offset, :]

      # call cuda kernel
      # NOTE(review): gamma is (re)initialized when epoch == 1 or
      # opt.reuse_gamma is set -- double-check the intended polarity of
      # reuse_gamma against the kernel
      train_loss, vali_loss = \
        self.obj.feed_data(cols, indptr.astype(np.int32),
                           vali, counts, gamma,
                           epoch == 1 or self.opt.reuse_gamma,
                           self.opt.num_iters_in_e_step)

      gamma_h5f[offset:next_offset, :] = gamma
      # accumulate loss (kernels return negative log-likelihood sums)
      train_loss_nume -= train_loss
      vali_loss_nume -= vali_loss
      train_loss_deno += np.sum(counts[~vali])
      vali_loss_deno += np.sum(counts[vali])
      train_loss = train_loss_nume / (train_loss_deno + EPS)
      vali_loss = vali_loss_nume / (vali_loss_deno + EPS)

      # update progress bar
      pbar.update(end, values=[("train_loss", train_loss),
                               ("vali_loss", vali_loss)])
      offset = next_offset

      if end == size:
        break

  def _train_m_step(self):
    """Pull device accumulators and update beta / alpha on the host."""
    self.obj.pull()

    # update beta: clamp then renormalize each topic column
    self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
    self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
    self.new_beta[:, :] = 0

    # update alpha by a Newton step; the Hessian is diagonal plus a
    # rank-one term, inverted in linear time via the shared constant c_0
    alpha_sum = np.sum(self.alpha)
    gvec = np.sum(self.grad_alpha, axis=0)
    gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
    hvec = self.num_docs * pg(1, self.alpha)
    z_0 = pg(1, alpha_sum)
    c_nume = np.sum(gvec / hvec)
    c_deno = 1 / z_0 + np.sum(1 / hvec)
    c_0 = c_nume / c_deno
    delta = (gvec - c_0) / hvec
    self.alpha -= delta
    self.alpha[:] = np.maximum(self.alpha, EPS)
    self.grad_alpha[:, :] = 0

    self.obj.push()

  def save_h5_model(self, filepath, chunk_size=10000):
    """Save alpha, beta, keys and gamma to an h5 file.

    Args:
      filepath: destination h5 path (parent dirs are created).
      chunk_size: number of gamma rows copied per iteration.
    """
    self.logger.info("save h5 format model path to %s", filepath)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    h5f = h5py.File(filepath, "w")
    h5f.create_dataset("alpha", data=self.alpha)
    h5f.create_dataset("beta", data=self.beta)
    h5f.create_dataset("keys", data=np.array([word.encode("utf")
                                              for word in self.words]))
    gamma = h5f.create_dataset("gamma", dtype=np.float32,
                               shape=(self.num_docs, self.opt.num_topics))
    # copy gamma chunk-by-chunk to keep peak memory bounded
    h5f_gamma = h5py.File(self.opt.gamma_path, "r")
    for offset in range(0, self.num_docs, chunk_size):
      next_offset = min(self.num_docs, offset + chunk_size)
      gamma[offset:next_offset, :] = h5f_gamma["gamma"][offset:next_offset, :]
    h5f_gamma.close()
    h5f.close()

  def remove_tmp(self):
    """Delete temp directories created by this instance (atexit hook)."""
    if not self.opt.remove_tmp:
      return
    for tmp_dir in self.tmp_dirs:
      if os.path.exists(tmp_dir):
        self.logger.info("remove %s", tmp_dir)
        shutil.rmtree(tmp_dir)
/cusim/cuw2v/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from cusim.cuw2v.pycuw2v import CuW2V 7 | -------------------------------------------------------------------------------- /cusim/cuw2v/bindings.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include "cuw2v/cuw2v.hpp" 12 | 13 | namespace py = pybind11; 14 | 15 | typedef py::array_t float_array; 16 | typedef py::array_t double_array; 17 | typedef py::array_t int_array; 18 | 19 | class CuW2VBind { 20 | public: 21 | CuW2VBind() {} 22 | 23 | bool Init(std::string opt_path) { 24 | return obj_.Init(opt_path); 25 | } 26 | 27 | void LoadModel(py::object& emb_in, py::object& emb_out) { 28 | // check shape of alpha and beta 29 | float_array _emb_in(emb_in); 30 | float_array _emb_out(emb_out); 31 | auto emb_in_buffer = _emb_in.request(); 32 | auto emb_out_buffer = _emb_out.request(); 33 | if (emb_in_buffer.ndim != 2 or emb_out_buffer.ndim != 2 or 34 | emb_in_buffer.shape[1] != emb_out_buffer.shape[1]) { 35 | throw std::runtime_error("invalid emb_in or emb_out"); 36 | } 37 | 38 | return obj_.LoadModel(_emb_in.mutable_data(0), _emb_out.mutable_data(0)); 39 | } 40 | 41 | void BuildRandomTable(py::object& word_count, int table_size) { 42 | double_array _word_count(word_count); 43 | auto wc_buffer = _word_count.request(); 44 | if (wc_buffer.ndim != 1) { 45 | throw std::runtime_error("invalid word count"); 46 | } 47 | int num_words = wc_buffer.shape[0]; 48 | 
obj_.BuildRandomTable(_word_count.data(0), num_words, table_size); 49 | } 50 | 51 | void BuildHuffmanTree(py::object& word_count) { 52 | float_array _word_count(word_count); 53 | auto wc_buffer = _word_count.request(); 54 | if (wc_buffer.ndim != 1) { 55 | throw std::runtime_error("invalid word count"); 56 | } 57 | int num_words = wc_buffer.shape[0]; 58 | obj_.BuildHuffmanTree(_word_count.data(0), num_words); 59 | } 60 | 61 | std::pair FeedData(py::object& cols, py::object& indptr) { 62 | int_array _cols(cols); 63 | int_array _indptr(indptr); 64 | auto cols_buffer = _cols.request(); 65 | auto indptr_buffer = _indptr.request(); 66 | if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) { 67 | throw std::runtime_error("invalid cols or indptr"); 68 | } 69 | int num_cols = cols_buffer.shape[0]; 70 | int num_indptr = indptr_buffer.shape[0] - 1; 71 | return obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr); 72 | } 73 | 74 | void Pull() { 75 | obj_.Pull(); 76 | } 77 | 78 | private: 79 | cusim::CuW2V obj_; 80 | }; 81 | 82 | PYBIND11_PLUGIN(cuw2v_bind) { 83 | py::module m("CuW2VBind"); 84 | 85 | py::class_(m, "CuW2VBind") 86 | .def(py::init()) 87 | .def("init", &CuW2VBind::Init, py::arg("opt_path")) 88 | .def("load_model", &CuW2VBind::LoadModel, 89 | py::arg("emb_in"), py::arg("emb_out")) 90 | .def("feed_data", &CuW2VBind::FeedData, 91 | py::arg("cols"), py::arg("indptr")) 92 | .def("pull", &CuW2VBind::Pull) 93 | .def("build_random_table", &CuW2VBind::BuildRandomTable, 94 | py::arg("word_count"), py::arg("table_size")) 95 | .def("build_huffman_tree", &CuW2VBind::BuildHuffmanTree, 96 | py::arg("word_count")) 97 | .def("__repr__", 98 | [](const CuW2VBind &a) { 99 | return ""; 100 | } 101 | ); 102 | return m.ptr(); 103 | } 104 | -------------------------------------------------------------------------------- /cusim/cuw2v/pycuw2v.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All 
rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member 8 | import os 9 | from os.path import join as pjoin 10 | 11 | import json 12 | import atexit 13 | import shutil 14 | import tempfile 15 | 16 | import h5py 17 | import numpy as np 18 | 19 | from cusim import aux, IoUtils 20 | from cusim.cuw2v.cuw2v_bind import CuW2VBind 21 | from cusim.config_pb2 import CuW2VConfigProto 22 | from cusim.constants import EPS, WARP_SIZE 23 | 24 | class CuW2V: 25 | def __init__(self, opt=None): 26 | self.opt = aux.get_opt_as_proto(opt or {}, CuW2VConfigProto) 27 | self.logger = aux.get_logger("culda", level=self.opt.py_log_level) 28 | 29 | assert self.opt.block_dim <= WARP_SIZE ** 2 and \ 30 | self.opt.block_dim % WARP_SIZE == 0, \ 31 | f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})" 32 | 33 | tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) 34 | opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) 35 | tmp.write(opt_content) 36 | tmp.close() 37 | 38 | self.logger.info("opt: %s", opt_content) 39 | self.obj = CuW2VBind() 40 | assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}" 41 | os.remove(tmp.name) 42 | 43 | self.words, self.word_count, self.num_words, self.num_docs = \ 44 | None, None, None, None 45 | self.emb_in, self.emb_out = None, None 46 | self.tmp_dirs = [] 47 | atexit.register(self.remove_tmp) 48 | 49 | def preprocess_data(self): 50 | if self.opt.skip_preprocess: 51 | return 52 | iou = IoUtils(aux.proto_to_dict(self.opt.io)) 53 | if not self.opt.processed_data_dir: 54 | self.opt.processed_data_dir = tempfile.TemporaryDirectory().name 55 | self.tmp_dirs.append(self.opt.processed_data_dir) 56 | iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count, 57 | self.opt.processed_data_dir) 58 | 59 | def init_model(self): 60 
| # load voca 61 | data_dir = self.opt.processed_data_dir 62 | keys_path = pjoin(data_dir, "keys.txt") 63 | count_path = pjoin(data_dir, "count.txt") 64 | self.logger.info("load key, count from %s, %s", keys_path, count_path) 65 | with open(keys_path, "rb") as fin: 66 | self.words = [line.strip().decode("utf8") for line in fin] 67 | with open(count_path, "rb") as fin: 68 | self.word_count = np.array([int(line.strip()) for line in fin], 69 | dtype=np.int64) 70 | self.num_words = len(self.words) 71 | assert len(self.words) == len(self.word_count) 72 | 73 | # count number of docs 74 | h5f = h5py.File(pjoin(data_dir, "token.h5"), "r") 75 | self.num_docs = h5f["indptr"].shape[0] - 1 76 | h5f.close() 77 | 78 | self.logger.info("number of words: %d, docs: %d", 79 | self.num_words, self.num_docs) 80 | 81 | # normalize word count 82 | word_count = np.power(self.word_count, self.opt.count_power, 83 | dtype=np.float64) 84 | word_count /= np.sum(word_count) 85 | if self.opt.neg: 86 | self.obj.build_random_table(word_count, self.opt.random_size) 87 | else: 88 | self.obj.build_huffman_tree(word_count.astype(np.float32)) 89 | 90 | # random initialize alpha and beta 91 | np.random.seed(self.opt.seed) 92 | scale = 1 / np.sqrt(self.opt.num_dims) 93 | self.emb_in = np.random.normal(loc=0, scale=scale, \ 94 | size=(self.num_words, self.opt.num_dims)).astype(np.float32) 95 | out_words = self.num_words if self.opt.neg else self.num_words - 1 96 | self.emb_out = np.random.normal(loc=0, scale=scale, \ 97 | size=(out_words, self.opt.num_dims)).astype(np.float32) 98 | self.logger.info("emb_in %s, emb_out %s initialized", 99 | self.emb_in.shape, self.emb_out.shape) 100 | 101 | if self.opt.pretrained_model.filename: 102 | self.load_word2vec_format(**aux.proto_to_dict(self.opt.pretrained_model)) 103 | 104 | # push it to gpu 105 | self.obj.load_model(self.emb_in, self.emb_out) 106 | 107 | def train_model(self): 108 | self.preprocess_data() 109 | self.init_model() 110 | h5f = 
h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r") 111 | for epoch in range(1, self.opt.epochs + 1): 112 | self.logger.info("Epoch %d / %d", epoch, self.opt.epochs) 113 | self._train_epoch(h5f) 114 | self.obj.pull() 115 | h5f.close() 116 | 117 | def _train_epoch(self, h5f): 118 | offset, size = 0, h5f["cols"].shape[0] 119 | pbar = aux.Progbar(size, stateful_metrics=["loss"]) 120 | loss_nume, loss_deno = 0, 0 121 | while True: 122 | target = h5f["indptr"][offset] + self.opt.batch_size 123 | if target < size: 124 | next_offset = h5f["rows"][target] 125 | else: 126 | next_offset = h5f["indptr"].shape[0] - 1 127 | indptr = h5f["indptr"][offset:next_offset + 1] 128 | beg, end = indptr[0], indptr[-1] 129 | indptr -= beg 130 | cols = h5f["cols"][beg:end] 131 | offset = next_offset 132 | 133 | # call cuda kernel 134 | _loss_nume, _loss_deno = \ 135 | self.obj.feed_data(cols, indptr.astype(np.int32)) 136 | 137 | # accumulate loss 138 | loss_nume += _loss_nume 139 | loss_deno += _loss_deno 140 | loss = loss_nume / (loss_deno + EPS) 141 | 142 | # update progress bar 143 | pbar.update(end, values=[("loss", loss)]) 144 | if end == size: 145 | break 146 | 147 | def save_h5_model(self, filename): 148 | self.logger.info("save h5 format model to %s", filename) 149 | os.makedirs(os.path.dirname(filename), exist_ok=True) 150 | h5f = h5py.File(filename, "w") 151 | h5f.create_dataset("emb_in", data=self.emb_in) 152 | h5f.create_dataset("emb_out", data=self.emb_out) 153 | h5f.create_dataset("keys", data=np.array([word.encode("utf") 154 | for word in self.words])) 155 | h5f.close() 156 | 157 | def save_word2vec_format(self, filename, binary=False, prefix=""): 158 | self.logger.info("save word2vec format model to %s, " 159 | "binary: %s, prefix: '%s'", filename, binary, prefix) 160 | # save model compatible with gensim and original w2v code by Google 161 | with open(filename, "wb") as fout: 162 | fout.write(f"{self.num_words} {self.opt.num_dims}\n".encode("utf8")) 163 | for 
idx, word in enumerate(self.words): 164 | vec = self.emb_in[idx] 165 | if binary: 166 | fout.write(f"{prefix}{word} ".encode("utf8") + vec.tobytes()) 167 | else: 168 | fout.write(f"{prefix}{word} " 169 | f"{' '.join(repr(val) for val in vec)}\n".encode("utf8")) 170 | 171 | def load_word2vec_format(self, filename, binary=False, 172 | symmetry=False, no_header=False): 173 | self.logger.info("load pretrained model from %s", filename) 174 | # copy pretrained model to emb_out as well only if 175 | # we use negative sampling, NOT hierarchical softmax 176 | assert not symmetry or self.opt.neg, "no symmetry in hierarchical softmax" 177 | 178 | # read variable 179 | vector_dict = {} 180 | with open(filename, "rb") as fin: 181 | if not no_header: 182 | fin.readline() # throw one line 183 | for line in fin: 184 | if binary: 185 | key, vec = line.split() 186 | vector_dict[key] = np.fromstring(vec, dtype=np.float32) 187 | else: 188 | line_vec = line.strip().split() 189 | key = line_vec[0].decode("utf8") 190 | vec = np.array([float(val) for val in line_vec[1:]], 191 | dtype=np.float32) 192 | vector_dict[key] = vec 193 | 194 | # copy to variable 195 | loaded_cnt = 0 196 | word_idmap = {word: idx for idx, word in enumerate(self.words)} 197 | for key, vec in vector_dict.items(): 198 | assert len(vec) == self.opt.num_dims 199 | if key not in word_idmap: 200 | continue 201 | idx = word_idmap[key] 202 | loaded_cnt += 1 203 | self.emb_in[idx, :] = vec 204 | if symmetry: 205 | self.emb_out[idx, :] = vec 206 | self.logger.info("loaded count: %d", loaded_cnt) 207 | 208 | def remove_tmp(self): 209 | if not self.opt.remove_tmp: 210 | return 211 | for tmp_dir in self.tmp_dirs: 212 | if os.path.exists(tmp_dir): 213 | self.logger.info("remove %s", tmp_dir) 214 | shutil.rmtree(tmp_dir) 215 | -------------------------------------------------------------------------------- /cusim/ioutils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 
(c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | from cusim.ioutils.pyioutils import IoUtils 7 | -------------------------------------------------------------------------------- /cusim/ioutils/bindings.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021 Jisang Yoon 2 | // All rights reserved. 3 | // 4 | // This source code is licensed under the Apache 2.0 license found in the 5 | // LICENSE file in the root directory of this source tree. 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include "utils/ioutils.hpp" 12 | 13 | namespace py = pybind11; 14 | 15 | typedef py::array_t float_array; 16 | typedef py::array_t int_array; 17 | typedef py::array_t int64_array; 18 | 19 | class IoUtilsBind { 20 | public: 21 | IoUtilsBind() {} 22 | 23 | bool Init(std::string opt_path) { 24 | return obj_.Init(opt_path); 25 | } 26 | 27 | int64_t LoadStreamFile(std::string filepath) { 28 | return obj_.LoadStreamFile(filepath); 29 | } 30 | 31 | std::pair ReadStreamForVocab(int num_lines, int num_threads) { 32 | return obj_.ReadStreamForVocab(num_lines, num_threads); 33 | } 34 | 35 | std::pair TokenizeStream(int num_lines, int num_threads) { 36 | return obj_.TokenizeStream(num_lines, num_threads); 37 | } 38 | 39 | void GetWordVocab(int min_count, std::string keys_path, std::string count_path) { 40 | obj_.GetWordVocab(min_count, keys_path, count_path); 41 | } 42 | 43 | void GetToken(py::object& rows, py::object& cols, py::object& indptr) { 44 | int_array _rows(rows); 45 | int_array _cols(cols); 46 | int_array _indptr(indptr); 47 | obj_.GetToken(_rows.mutable_data(0), _cols.mutable_data(0), _indptr.mutable_data(0)); 48 | } 49 | 50 | std::tuple ReadBagOfWordsHeader(std::string filepath) { 51 | return obj_.ReadBagOfWordsHeader(filepath); 52 | } 53 | 54 | void 
ReadBagOfWordsContent(py::object& rows, py::object& cols, 55 | py::object counts) { 56 | int64_array _rows(rows); 57 | int_array _cols(cols); 58 | float_array _counts(counts); 59 | auto rows_buffer = _rows.request(); 60 | auto cols_buffer = _cols.request(); 61 | auto counts_buffer = _counts.request(); 62 | int num_lines = rows_buffer.shape[0]; 63 | if (cols_buffer.shape[0] != num_lines or counts_buffer.shape[0] != num_lines) { 64 | throw std::runtime_error("invalid shape"); 65 | } 66 | obj_.ReadBagOfWordsContent(_rows.mutable_data(0), 67 | _cols.mutable_data(0), _counts.mutable_data(0), num_lines); 68 | } 69 | 70 | private: 71 | cusim::IoUtils obj_; 72 | }; 73 | 74 | PYBIND11_PLUGIN(ioutils_bind) { 75 | py::module m("IoUtilsBind"); 76 | 77 | py::class_(m, "IoUtilsBind") 78 | .def(py::init()) 79 | .def("init", &IoUtilsBind::Init, py::arg("opt_path")) 80 | .def("load_stream_file", &IoUtilsBind::LoadStreamFile, py::arg("filepath")) 81 | .def("read_stream_for_vocab", &IoUtilsBind::ReadStreamForVocab, 82 | py::arg("num_lines"), py::arg("num_threads")) 83 | .def("tokenize_stream", &IoUtilsBind::TokenizeStream, 84 | py::arg("num_lines"), py::arg("num_threads")) 85 | .def("get_word_vocab", &IoUtilsBind::GetWordVocab, 86 | py::arg("min_count"), py::arg("keys_path"), py::arg("count_path")) 87 | .def("get_token", &IoUtilsBind::GetToken, 88 | py::arg("indices"), py::arg("indptr"), py::arg("offset")) 89 | .def("read_bag_of_words_header", &IoUtilsBind::ReadBagOfWordsHeader, 90 | py::arg("filepath")) 91 | .def("read_bag_of_words_content", &IoUtilsBind::ReadBagOfWordsContent, 92 | py::arg("rows"), py::arg("cols"), py::arg("counts")) 93 | .def("__repr__", 94 | [](const IoUtilsBind &a) { 95 | return ""; 96 | } 97 | ); 98 | return m.ptr(); 99 | } 100 | -------------------------------------------------------------------------------- /cusim/ioutils/pyioutils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # 
All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member 8 | import os 9 | from os.path import join as pjoin 10 | 11 | import json 12 | import tempfile 13 | 14 | import h5py 15 | import numpy as np 16 | 17 | from cusim import aux 18 | from cusim.ioutils.ioutils_bind import IoUtilsBind 19 | from cusim.config_pb2 import IoUtilsConfigProto 20 | 21 | class IoUtils: 22 | def __init__(self, opt=None): 23 | self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto) 24 | self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level) 25 | 26 | tmp = tempfile.NamedTemporaryFile(mode='w', delete=False) 27 | opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2) 28 | tmp.write(opt_content) 29 | tmp.close() 30 | 31 | self.logger.info("opt: %s", opt_content) 32 | self.obj = IoUtilsBind() 33 | assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}" 34 | os.remove(tmp.name) 35 | 36 | def load_stream_vocab(self, filepath, min_count, 37 | keys_path, count_path): 38 | full_num_lines = self.obj.load_stream_file(filepath) 39 | pbar = aux.Progbar(full_num_lines, unit_name="line", 40 | stateful_metrics=["word_count"]) 41 | processed = 0 42 | while True: 43 | read_lines, word_count = \ 44 | self.obj.read_stream_for_vocab( 45 | self.opt.chunk_lines, self.opt.num_threads) 46 | processed += read_lines 47 | pbar.update(processed, values=[("word_count", word_count)]) 48 | if processed == full_num_lines: 49 | break 50 | self.obj.get_word_vocab(min_count, keys_path, count_path) 51 | 52 | def convert_stream_to_h5(self, filepath, min_count, out_dir, 53 | chunk_indices=10000, seed=777): 54 | np.random.seed(seed) 55 | os.makedirs(out_dir, exist_ok=True) 56 | keys_path = pjoin(out_dir, "keys.txt") 57 | count_path = pjoin(out_dir, "count.txt") 58 | token_path = pjoin(out_dir, 
"token.h5") 59 | self.logger.info("save key, count, token to %s, %s, %s", 60 | keys_path, count_path, token_path) 61 | self.load_stream_vocab(filepath, min_count, keys_path, count_path) 62 | full_num_lines = self.obj.load_stream_file(filepath) 63 | pbar = aux.Progbar(full_num_lines, unit_name="line") 64 | processed = 0 65 | h5f = h5py.File(token_path, "w") 66 | rows = h5f.create_dataset("rows", shape=(chunk_indices,), 67 | maxshape=(None,), dtype=np.int64, 68 | chunks=(chunk_indices,)) 69 | cols = h5f.create_dataset("cols", shape=(chunk_indices,), 70 | maxshape=(None,), dtype=np.int32, 71 | chunks=(chunk_indices,)) 72 | vali = h5f.create_dataset("vali", shape=(chunk_indices,), 73 | maxshape=(None,), dtype=np.float32, 74 | chunks=(chunk_indices,)) 75 | indptr = h5f.create_dataset("indptr", shape=(full_num_lines + 1,), 76 | dtype=np.int64, chunks=True) 77 | processed, offset = 1, 0 78 | indptr[0] = 0 79 | while True: 80 | read_lines, data_size = self.obj.tokenize_stream( 81 | self.opt.chunk_lines, self.opt.num_threads) 82 | _rows = np.empty(shape=(data_size,), dtype=np.int32) 83 | _cols = np.empty(shape=(data_size,), dtype=np.int32) 84 | _indptr = np.empty(shape=(read_lines,), dtype=np.int32) 85 | self.obj.get_token(_rows, _cols, _indptr) 86 | rows.resize((offset + data_size,)) 87 | rows[offset:offset + data_size] = \ 88 | _rows.astype(np.int64) + (processed - 1) 89 | cols.resize((offset + data_size,)) 90 | cols[offset:offset + data_size] = _cols 91 | vali.resize((offset + data_size,)) 92 | vali[offset:offset + data_size] = \ 93 | np.random.uniform(size=(data_size,)).astype(np.float32) 94 | indptr[processed:processed + read_lines] = \ 95 | _indptr.astype(np.int64) + offset 96 | offset += data_size 97 | processed += read_lines 98 | pbar.update(processed - 1) 99 | if processed == full_num_lines + 1: 100 | break 101 | h5f.close() 102 | 103 | def convert_bow_to_h5(self, filepath, h5_path): 104 | self.logger.info("convert bow %s to h5 %s", filepath, h5_path) 105 | 
num_docs, num_words, num_lines = \ 106 | self.obj.read_bag_of_words_header(filepath) 107 | self.logger.info("number of docs: %d, words: %d, nnz: %d", 108 | num_docs, num_words, num_lines) 109 | h5f = h5py.File(h5_path, "w") 110 | rows = h5f.create_dataset("rows", dtype=np.int64, 111 | shape=(num_lines,), chunks=True) 112 | cols = h5f.create_dataset("cols", dtype=np.int32, 113 | shape=(num_lines,), chunks=True) 114 | counts = h5f.create_dataset("counts", dtype=np.float32, 115 | shape=(num_lines,), chunks=True) 116 | vali = h5f.create_dataset("vali", dtype=np.float32, 117 | shape=(num_lines,), chunks=True) 118 | indptr = h5f.create_dataset("indptr", dtype=np.int64, 119 | shape=(num_docs + 1,), chunks=True) 120 | indptr[0] = 0 121 | processed, recent_row, indptr_offset = 0, 0, 1 122 | pbar = aux.Progbar(num_lines, unit_name="line") 123 | while processed < num_lines: 124 | # get chunk size 125 | read_lines = min(num_lines - processed, self.opt.chunk_lines) 126 | 127 | # copy rows, cols, counts to h5 128 | _rows = np.empty((read_lines,), dtype=np.int64) 129 | _cols = np.empty((read_lines,), dtype=np.int32) 130 | _counts = np.empty((read_lines,), dtype=np.float32) 131 | self.obj.read_bag_of_words_content(_rows, _cols, _counts) 132 | rows[processed:processed + read_lines] = _rows 133 | cols[processed:processed + read_lines] = _cols 134 | counts[processed:processed + read_lines] = _counts 135 | vali[processed:processed + read_lines] = \ 136 | np.random.uniform(size=(read_lines,)).astype(np.float32) 137 | 138 | # compute indptr 139 | prev_rows = np.zeros((read_lines,), dtype=np.int64) 140 | prev_rows[1:] = _rows[:-1] 141 | prev_rows[0] = recent_row 142 | diff = _rows - prev_rows 143 | indices = np.where(diff > 0)[0] 144 | _indptr = [] 145 | for idx in indices: 146 | _indptr += ([processed + idx] * diff[idx]) 147 | if _indptr: 148 | indptr[indptr_offset:indptr_offset + len(_indptr)] = \ 149 | np.array(_indptr, dtype=np.int64) 150 | indptr_offset += len(_indptr) 151 | 152 | # 
# update processed
path to store gamma in E step 47 | // if empty, make temporary directory 48 | optional string gamma_path = 17; 49 | 50 | // reuse gamma from previous epoch if true 51 | // if false, initiate gamma as Figure 6 in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf 52 | optional bool reuse_gamma = 18; 53 | 54 | // number of topics 55 | optional int32 num_topics = 3 [default = 10]; 56 | 57 | // block dimension in CUDA 58 | // should be multiple of WARP_SIZE (=32) 59 | optional int32 block_dim = 4 [default = 32]; 60 | 61 | // set the number blocks as num_blocks * block_dim = physical_cores_in_GPU * hyper_threads 62 | optional int32 hyper_threads = 5 [default = 100]; 63 | 64 | // batch size in training 65 | optional int32 batch_size = 10 [default = 1000000]; 66 | 67 | // number of epochs in training 68 | optional int32 epochs = 11 [default = 10]; 69 | 70 | // number of iterations in each E step 71 | optional int32 num_iters_in_e_step = 12 [default = 5]; 72 | 73 | // validation ratio, should be between 0 and 1 74 | optional double vali_p = 13 [default = 0.2]; 75 | 76 | // random seed 77 | optional int32 seed = 14 [default = 777]; 78 | 79 | // remove all tempory directorys generated by package when program finnished if true 80 | optional bool remove_tmp = 19 [default = true]; 81 | 82 | optional IoUtilsConfigProto io = 15; 83 | } 84 | 85 | // options for loading pretrained w2v model 86 | // can load w2v model file generated by gensim or original w2v code by Google 87 | message W2VPretrainedModel { 88 | optional string filename = 1; 89 | optional bool no_header = 2; 90 | optional bool binary = 3; 91 | optional bool symmetry = 4; 92 | } 93 | 94 | 95 | // option for training Word2Vec model 96 | message CuW2VConfigProto { 97 | // logging levels in python and C++ 98 | optional int32 py_log_level = 1 [default = 2]; 99 | optional int32 c_log_level = 2 [default = 2]; 100 | 101 | // raw data path (stream txt format) 102 | optional string data_path = 7; 103 | 104 | // path to 
save preprocessed data (hdf5 format) 105 | optional string processed_data_dir = 6; 106 | 107 | // skip data preprocessing (therefore, there should be 108 | // already preprocessed hdf5 format file) if true 109 | optional bool skip_preprocess = 8; 110 | 111 | // number of embedding dimensions 112 | optional int32 num_dims = 3 [default = 50]; 113 | 114 | // block_dim in CUDA 115 | optional int32 block_dim = 4 [default = 32]; 116 | 117 | // set number of blocks as num_blocks * block_dim = physical_cores_in_GPU * hyper_threads 118 | optional int32 hyper_threads = 5 [default = 100]; 119 | 120 | // generate vocabulary with words appreared in corpus at least word_min_count times 121 | optional int32 word_min_count = 9 [default = 5]; 122 | 123 | // batch size and number of epochs in training 124 | optional int32 batch_size = 10 [default = 1000000]; 125 | optional int32 epochs = 11 [default = 10]; 126 | 127 | // seed fields 128 | optional int32 seed = 14 [default = 777]; 129 | 130 | // random table size in negative sampling 131 | optional int32 random_size = 12 [default = 100000000]; 132 | 133 | // number of negative samples 134 | // if zero, it uses hierarchical softmax 135 | optional int32 neg = 17 [default = 10]; 136 | 137 | // weight in negative sampling will be word_count ** count_power for each word 138 | // default value 0.75 is recommended in w2v paper 139 | optional double count_power = 18 [default = 0.75]; 140 | 141 | // if true, train skip gram model, else train cbow model 142 | optional bool skip_gram = 19 [default = true]; 143 | 144 | // if true, use average context vector in cbow model 145 | // else use summation of context vectors 146 | optional bool cbow_mean = 20 [default = true]; 147 | 148 | // learning rate 149 | optional double lr = 21 [default = 0.001]; 150 | 151 | // window size in both skip gram and cbow model 152 | optional int32 window_size = 22 [default = 5]; 153 | 154 | // remove all tempory directorys generated by package when program finnished 
if true 155 | optional bool remove_tmp = 26 [default = true]; 156 | 157 | optional IoUtilsConfigProto io = 24; 158 | optional W2VPretrainedModel pretrained_model = 25; 159 | } 160 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = cusim 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=invalid-name,unused-import,redefined-builtin 2 | 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file only contains a selection of the most common options. For a full 6 | # list see the documentation: 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | import sphinx_rtd_theme 22 | 23 | project = 'cusim' 24 | copyright = '2021, Jisang Yoon' 25 | author = 'Jisang Yoon' 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = "0.0.1" 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | "sphinx_rtd_theme", 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.napoleon" 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This patterns also effect to html_static_path and html_extra_path 48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 49 | 50 | # The name of the Pygments (syntax highlighting) style to use. 51 | pygments_style = "sphinx" 52 | 53 | # If true, `todo` and `todoList` produce output, else they produce nothing. 54 | todo_include_todos = False 55 | 56 | # -- Options for HTML output ------------------------------------------------- 57 | 58 | # The theme to use for HTML and HTML Help pages. See the documentation for 59 | # a list of builtin themes. 60 | # 61 | html_theme = "sphinx_rtd_theme" 62 | 63 | # Add any paths that contain custom static files (such as style sheets) here, 64 | # relative to this directory. They are copied after the builtin static files, 65 | # so a file named "default.css" will overwrite the builtin "default.css". 
66 | html_static_path = ['_static'] 67 | 68 | html_sidebars = { 69 | "**": [ 70 | "about.html", 71 | "navigation.html", 72 | "relations.html", # needs 'show_related': True theme option to display 73 | "searchbox.html", 74 | "donate.html", 75 | ] 76 | } 77 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. cusim documentation master file, created by 2 | sphinx-quickstart on Sat Feb 20 13:36:31 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | CUSIM - Superfast implementation of Word2Vec and LDA 7 | ==================================================== 8 | 9 | 10 | CUSIM is a project to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as `gensim `_'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the `word2vec `_ model, and the most representative topic model, the `LDA (Latent Dirichlet Allocation) `_ model. 11 | 12 | 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | :caption: Contents 17 | 18 | Installation 19 | Word2Vec 20 | LDA 21 | 22 | 23 | Indices and tables 24 | ================== 25 | 26 | * :ref:`genindex` 27 | * :ref:`modindex` 28 | * :ref:`search` 29 | -------------------------------------------------------------------------------- /docs/install.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Install from pypi 5 | ----------------- 6 | 7 | .. code-block:: shell 8 | 9 | pip install cusim 10 | 11 | 12 | Install from source 13 | -------------------- 14 | 15 | .. 
code-block:: shell 16 | 17 | # clone repo and submodules 18 | git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init 19 | 20 | # install requirements 21 | pip install -r requirements.txt 22 | 23 | # generate proto 24 | python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto 25 | 26 | # install 27 | python setup.py install 28 | -------------------------------------------------------------------------------- /docs/lda.rst: -------------------------------------------------------------------------------- 1 | LDA 2 | === 3 | 4 | 5 | Parameters 6 | ---------- 7 | 8 | 9 | - See `CuLDAConfigProto `_ 10 | 11 | 12 | Example Codes 13 | ------------- 14 | 15 | - Full source code is in `examples/example_lda.py `_ 16 | 17 | - Before running example codes, run 18 | 19 | .. code-block:: shell 20 | 21 | pip install -r examples/requirements.txt 22 | 23 | 24 | - Download and preprocess data 25 | 26 | .. code-block:: python 27 | 28 | import os 29 | from os.path import join as pjoin 30 | import subprocess 31 | 32 | import wget 33 | 34 | DATASET = "nytimes" 35 | DIR_PATH = "./res" 36 | BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \ 37 | "bag-of-words/" 38 | 39 | 40 | # download docword 41 | filename = f"docword.{DATASET}.txt.gz" 42 | out_path = pjoin(DIR_PATH, filename) 43 | wget.download(BASE_URL + filename, out=out_path) 44 | print() 45 | 46 | # decompress 47 | cmd = ["gunzip", "-c", out_path, ">", 48 | pjoin(DIR_PATH, f"docword.{DATASET}.txt")] 49 | cmd = " ".join(cmd) 50 | subprocess.call(cmd, shell=True) 51 | os.remove(pjoin(DIR_PATH, filename)) 52 | 53 | # download vocab 54 | filename = f"vocab.{DATASET}.txt" 55 | out_path = pjoin(DIR_PATH, filename) 56 | wget.download(BASE_URL + filename, out=out_path) 57 | print() 58 | 59 | - Train cusim word2vec 60 | 61 | .. 
code-block:: python 62 | 63 | from cusim import CuLDA 64 | 65 | data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt") 66 | keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt") 67 | processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5") 68 | opt = { 69 | "data_path": data_path, 70 | "processed_data_path": processed_data_path, 71 | "keys_path": keys_path, 72 | "num_topics": 50, 73 | "num_iters_in_e_step": 10, 74 | "reuse_gamma": True, 75 | # "skip_preprocess": os.path.exists(processed_data_path), 76 | } 77 | start = time.time() 78 | lda = CuLDA(opt) 79 | lda.train_model() 80 | 81 | 82 | - Save and evaluate model 83 | 84 | .. code-block:: python 85 | 86 | h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5") 87 | lda.save_h5_model(h5_model_path) 88 | 89 | h5f = h5py.File(h5_model_path, "r") 90 | beta = h5f["beta"][:, :].T 91 | keys = h5f["keys"][:] 92 | topk = 10 93 | 94 | for idx in range(beta.shape[0]): 95 | print("=" * 50) 96 | print(f"topic {idx + 1}") 97 | print("-" * 50) 98 | _beta = beta[idx, :] 99 | indices = np.argsort(-_beta)[:topk] 100 | for rank, wordid in enumerate(indices): 101 | word = keys[wordid].decode("utf8") 102 | prob = _beta[wordid] 103 | print(f"rank {rank + 1}. 
{word}: {prob}") 104 | 105 | 106 | Performance 107 | ----------- 108 | 109 | - Data: `nytimes dataset `_ 110 | - Topic Results 111 | - `cusim lda results `_ 112 | - `gensim lda results `_ 113 | - Time Performance 114 | - Experimented in `AWS g4dn 2xlarge `_ (One NVIDIA T4 and 8 vcpus of 8 Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz) 115 | 116 | +---------------------+-------------------+--------------------+ 117 | | attr | gensim (8 vpus) | cusim (NVIDIA T4)| 118 | +=====================+===================+====================+ 119 | | training time (sec) | 447.376 | **76.6972** | 120 | +---------------------+-------------------+--------------------+ 121 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=python -msphinx 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | set SPHINXPROJ=cusim 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The Sphinx module was not found. Make sure you have Sphinx installed, 20 | echo.then set the SPHINXBUILD environment variable to point to the full 21 | echo.path of the 'sphinx-build' executable. Alternatively you may add the 22 | echo.Sphinx directory to PATH. 23 | echo. 
24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/w2v.rst: -------------------------------------------------------------------------------- 1 | Word2Vec 2 | ======== 3 | 4 | 5 | Parameters 6 | ---------- 7 | 8 | 9 | - See `CuW2VConfigProto `_ 10 | 11 | 12 | Example Codes 13 | ------------- 14 | 15 | - Full source code is in `examples/example_w2v.py `_ 16 | 17 | - Before running example codes, run 18 | 19 | .. code-block:: shell 20 | 21 | pip install -r examples/requirements.txt 22 | 23 | 24 | - Download and preprocess data 25 | 26 | .. code-block:: python 27 | 28 | import os 29 | import subprocess 30 | 31 | import nltk 32 | from nltk.tokenize import RegexpTokenizer 33 | 34 | DOWNLOAD_PATH = "./res" 35 | DATASET = "quora-duplicate-questions" 36 | DATA_PATH = f"./res/{DATASET}.stream.txt" 37 | PROCESSED_DATA_DIR = f"./res/{DATASET}-processed" 38 | 39 | def preprocess_line(line, tokenizer, lemmatizer): 40 | line = line.lower() 41 | line = tokenizer.tokenize(line) 42 | line = [token for token in line 43 | if not token.isnumeric() and len(token) > 1] 44 | line = [lemmatizer.lemmatize(token) for token in line] 45 | return " ".join(line) 46 | 47 | # download 48 | api.BASE_DIR = DOWNLOAD_PATH 49 | filepath = api.load(DATASET, return_path=True) 50 | cmd = ["gunzip", "-c", filepath, ">", DATA_PATH] 51 | cmd = " ".join(cmd) 52 | subprocess.call(cmd, shell=True) 53 | 54 | # preprocess data 55 | tokenizer = RegexpTokenizer(r'\w+') 56 | nltk.download("wordnet") 57 | lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 58 | fout = open(DATA_PATH + ".tmp", "wb") 59 | with open(DATA_PATH, "rb") as fin: 60 | for line in tqdm.tqdm(fin): 61 | line = 
line.decode("utf8").strip() 62 | line = preprocess_line(line, tokenizer, lemmatizer) 63 | fout.write((line + "\n").encode("utf8")) 64 | fout.close() 65 | os.rename(DATA_PATH + ".tmp", DATA_PATH) 66 | 67 | - Train cusim word2vec 68 | 69 | .. code-block:: python 70 | 71 | from cusim import CuW2V 72 | 73 | MIN_COUNT = 5 74 | LEARNING_RATE = 0.001 75 | NEG_SIZE = 10 76 | NUM_DIMS = 100 77 | CBOW_MEAN = False 78 | EPOCHS = 10 79 | 80 | opt = { 81 | "data_path": DATA_PATH, 82 | "processed_data_dir": PROCESSED_DATA_DIR, 83 | # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR), 84 | "num_dims": NUM_DIMS, 85 | "epochs": EPOCHS, 86 | "word_min_count": MIN_COUNT, 87 | "lr": 0.001, 88 | "io": { 89 | "lower": False 90 | }, 91 | "neg": 0 if hierarchical_softmax else NEG_SIZE, 92 | "skip_gram": skip_gram, 93 | "cbow_mean": CBOW_MEAN, 94 | } 95 | w2v = CuW2V(opt) 96 | w2v.train_model() 97 | 98 | 99 | - Save and evaluate model 100 | 101 | .. code-block:: python 102 | 103 | import gensim 104 | from gensim.test.utils import datapath 105 | 106 | CUSIM_MODEL = "./res/cusim.w2v.model" 107 | 108 | w2v.save_word2vec_format(CUSIM_MODEL, binary=False) 109 | model = gensim.models.KeyedVectors.load_word2vec_format(model) 110 | results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"), 111 | case_insensitive=False) 112 | 113 | Performance 114 | ----------- 115 | 116 | - Data: quora-duplicate-questions from `gensim downloader api `_ 117 | - skip gram, hierarchical softmax 118 | - Experimented in `AWS g4dn 2xlarge `_ (One NVIDIA T4 and 8 vcpus of 8 Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz) 119 | 120 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 121 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 122 | 
+=====================+======================+======================+======================+======================+=====================+ 123 | | training time (sec) | 892.596 | 544.212 | 310.727 | 226.472 | **16.162** | 124 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 125 | | pearson | 0.487832 | 0.487696 | 0.482821 | 0.487136 | **0.492101** | 126 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 127 | | spearman | 0.500846 | 0.506214 | 0.501048 | **0.506718** | 0.479468 | 128 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 129 | 130 | - skip gram, negative sampling 131 | 132 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 133 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 134 | +=====================+======================+======================+======================+======================+=====================+ 135 | | training time (sec) | 586.545 | 340.489 | 220.804 | 146.23 | **33.9173** | 136 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 137 | | pearson | 0.354448 | 0.353952 | 0.352398 | 0.352925 | **0.360436** | 138 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 139 | | spearman | 0.369146 | 0.369365 | **0.370565** | 0.365822 | 0.355204 | 140 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 141 | 142 | - CBOW, hierarchical softmax 143 | 144 | 
+---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 145 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 146 | +=====================+======================+======================+======================+======================+=====================+ 147 | | training time (sec) | 250.135 | 155.121 | 103.57 | 73.8073 | **6.20787** | 148 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 149 | | pearson | 0.309651 | 0.321803 | 0.324854 | 0.314255 | **0.480298** | 150 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 151 | | spearman | 0.294047 | 0.308723 | 0.318293 | 0.300591 | **0.480971** | 152 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 153 | 154 | - CBOW, negative sampling 155 | 156 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 157 | | attr | 1 workers (gensim) | 2 workers (gensim) | 4 workers (gensim) | 8 workers (gensim) | NVIDIA T4 (cusim) | 158 | +=====================+======================+======================+======================+======================+=====================+ 159 | | training time (sec) | 176.923 | 100.369 | 69.7829 | 49.9274 | **9.90391** | 160 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 161 | | pearson | 0.18772 | 0.193152 | 0.204509 | 0.187924 | **0.368202** | 162 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 163 | | spearman | 0.243975 | 
0.24587 | 0.260531 | 0.237441 | **0.358042** | 164 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+ 165 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | ### How to run example code 2 | 3 | 0. install requirements 4 | 5 | ```shell 6 | pip install -r requirements.txt 7 | ``` 8 | 9 | 1. first, it is good to know about python-fire in https://github.com/google/python-fire, if you haven't heard yet. 10 | 11 | 2. run w2v experiments on various setting (e.g. skip gram with hierarchical softmax) 12 | 13 | ```shell 14 | python example_w2v.py run_experiments --sg0=True --hs0=True 15 | ``` 16 | 17 | 3. run lda experiments 18 | 19 | ```shell 20 | python example_lda.py run_experiments 21 | ``` 22 | -------------------------------------------------------------------------------- /examples/cusim.topics.txt: -------------------------------------------------------------------------------- 1 | ================================================== 2 | topic 1 3 | -------------------------------------------------- 4 | rank 1. car: 0.02677285298705101 5 | rank 2. vehicle: 0.006062767934054136 6 | rank 3. wheel: 0.005854051560163498 7 | rank 4. door: 0.0056894212029874325 8 | rank 5. vehicles: 0.005506897810846567 9 | rank 6. model: 0.005505426321178675 10 | rank 7. seat: 0.00544615276157856 11 | rank 8. zzz_ford: 0.004928195849061012 12 | rank 9. truck: 0.00481862248852849 13 | rank 10. front: 0.004714458715170622 14 | ================================================== 15 | topic 2 16 | -------------------------------------------------- 17 | rank 1. priest: 0.020068475976586342 18 | rank 2. church: 0.018575558438897133 19 | rank 3. abuse: 0.014300045557320118 20 | rank 4. sexual: 0.012478752993047237 21 | rank 5. information: 0.011768681928515434 22 | rank 6. 
bishop: 0.010295535437762737 23 | rank 7. privacy: 0.00979470182210207 24 | rank 8. enditalic: 0.007282644044607878 25 | rank 9. zzz_government: 0.007169242016971111 26 | rank 10. beginitalic: 0.007022677455097437 27 | ================================================== 28 | topic 3 29 | -------------------------------------------------- 30 | rank 1. scientist: 0.012924057431519032 31 | rank 2. plant: 0.010201558470726013 32 | rank 3. animal: 0.00955168902873993 33 | rank 4. human: 0.006574922241270542 34 | rank 5. water: 0.006187545135617256 35 | rank 6. species: 0.005247119814157486 36 | rank 7. science: 0.003869544016197324 37 | rank 8. research: 0.0037548812106251717 38 | rank 9. chemical: 0.0036675187293440104 39 | rank 10. researcher: 0.003376629902049899 40 | ================================================== 41 | topic 4 42 | -------------------------------------------------- 43 | rank 1. room: 0.0097695617005229 44 | rank 2. building: 0.009012440219521523 45 | rank 3. hotel: 0.007701032795011997 46 | rank 4. town: 0.007017012685537338 47 | rank 5. visitor: 0.005999790038913488 48 | rank 6. park: 0.004900350235402584 49 | rank 7. water: 0.00483303889632225 50 | rank 8. restaurant: 0.004804808646440506 51 | rank 9. tour: 0.004689469002187252 52 | rank 10. house: 0.004657984711229801 53 | ================================================== 54 | topic 5 55 | -------------------------------------------------- 56 | rank 1. executive: 0.010459953919053078 57 | rank 2. president: 0.009247427806258202 58 | rank 3. chief: 0.007263457868248224 59 | rank 4. deal: 0.006793353706598282 60 | rank 5. media: 0.006781542673707008 61 | rank 6. zzz_u_s: 0.006448546424508095 62 | rank 7. question: 0.006310692522674799 63 | rank 8. public: 0.0058334809727966785 64 | rank 9. client: 0.0057419463992118835 65 | rank 10. com: 0.005699603818356991 66 | ================================================== 67 | topic 6 68 | -------------------------------------------------- 69 | rank 1. 
official: 0.010707475244998932 70 | rank 2. zzz_new_york: 0.010631673038005829 71 | rank 3. building: 0.006501882337033749 72 | rank 4. found: 0.005982889328151941 73 | rank 5. worker: 0.005903987213969231 74 | rank 6. officer: 0.00562720000743866 75 | rank 7. hour: 0.005579715128988028 76 | rank 8. security: 0.0047438195906579494 77 | rank 9. plane: 0.004530096426606178 78 | rank 10. attack: 0.0045171682722866535 79 | ================================================== 80 | topic 7 81 | -------------------------------------------------- 82 | rank 1. gold: 0.008730198256671429 83 | rank 2. hour: 0.00799502432346344 84 | rank 3. floor: 0.007750170771032572 85 | rank 4. medal: 0.005479565821588039 86 | rank 5. rider: 0.005427593365311623 87 | rank 6. ice: 0.005187307950109243 88 | rank 7. event: 0.004164813086390495 89 | rank 8. silver: 0.00394535344094038 90 | rank 9. hand: 0.003944935742765665 91 | rank 10. moment: 0.003745123278349638 92 | ================================================== 93 | topic 8 94 | -------------------------------------------------- 95 | rank 1. customer: 0.020594391971826553 96 | rank 2. product: 0.01662433333694935 97 | rank 3. weather: 0.010188293643295765 98 | rank 4. stores: 0.009588934481143951 99 | rank 5. marketing: 0.007573566399514675 100 | rank 6. consumer: 0.007247460074722767 101 | rank 7. need: 0.00708211213350296 102 | rank 8. business: 0.006656122859567404 103 | rank 9. problem: 0.006193865556269884 104 | rank 10. sales: 0.00576401362195611 105 | ================================================== 106 | topic 9 107 | -------------------------------------------------- 108 | rank 1. zzz_enron: 0.0346914604306221 109 | rank 2. anthrax: 0.018805652856826782 110 | rank 3. firm: 0.017304280772805214 111 | rank 4. employees: 0.013712462969124317 112 | rank 5. accounting: 0.011462894268333912 113 | rank 6. company: 0.010516936890780926 114 | rank 7. letter: 0.009165323339402676 115 | rank 8. 
zzz_arthur_andersen: 0.008399050682783127 116 | rank 9. financial: 0.006972334813326597 117 | rank 10. official: 0.006733026821166277 118 | ================================================== 119 | topic 10 120 | -------------------------------------------------- 121 | rank 1. game: 0.02054956741631031 122 | rank 2. yard: 0.01949656568467617 123 | rank 3. season: 0.018450269475579262 124 | rank 4. play: 0.01595749706029892 125 | rank 5. team: 0.014850640669465065 126 | rank 6. coach: 0.012072306126356125 127 | rank 7. football: 0.010657819919288158 128 | rank 8. player: 0.010432523675262928 129 | rank 9. zzz_nfl: 0.009206585586071014 130 | rank 10. defensive: 0.008976943790912628 131 | ================================================== 132 | topic 11 133 | -------------------------------------------------- 134 | rank 1. con: 0.020736297592520714 135 | rank 2. una: 0.013567320071160793 136 | rank 3. las: 0.01041751354932785 137 | rank 4. mas: 0.010156860575079918 138 | rank 5. dice: 0.009438637644052505 139 | rank 6. por: 0.00928747933357954 140 | rank 7. como: 0.008855272084474564 141 | rank 8. los: 0.008734958246350288 142 | rank 9. zzz_argentina: 0.0077548702247440815 143 | rank 10. anos: 0.0052759042009711266 144 | ================================================== 145 | topic 12 146 | -------------------------------------------------- 147 | rank 1. zzz_afghanistan: 0.02263963408768177 148 | rank 2. zzz_taliban: 0.019689183682203293 149 | rank 3. military: 0.014852428808808327 150 | rank 4. bin: 0.014605461619794369 151 | rank 5. laden: 0.014458988793194294 152 | rank 6. war: 0.01199477817863226 153 | rank 7. zzz_pakistan: 0.01184108667075634 154 | rank 8. terrorist: 0.011557201854884624 155 | rank 9. zzz_u_s: 0.01051971036940813 156 | rank 10. attack: 0.009562982246279716 157 | ================================================== 158 | topic 13 159 | -------------------------------------------------- 160 | rank 1. court: 0.02521500550210476 161 | rank 2. 
case: 0.023994332179427147 162 | rank 3. lawyer: 0.019229630008339882 163 | rank 4. trial: 0.012606462463736534 164 | rank 5. attorney: 0.011963741853833199 165 | rank 6. law: 0.010776755400002003 166 | rank 7. prosecutor: 0.010139403864741325 167 | rank 8. judge: 0.010069739073514938 168 | rank 9. federal: 0.01000827457755804 169 | rank 10. charges: 0.009131026454269886 170 | ================================================== 171 | topic 14 172 | -------------------------------------------------- 173 | rank 1. children: 0.024856505915522575 174 | rank 2. family: 0.023518990725278854 175 | rank 3. mother: 0.021585773676633835 176 | rank 4. parent: 0.018566781654953957 177 | rank 5. father: 0.017965450882911682 178 | rank 6. child: 0.016640648245811462 179 | rank 7. son: 0.014798246324062347 180 | rank 8. boy: 0.013485484756529331 181 | rank 9. girl: 0.01209142617881298 182 | rank 10. daughter: 0.011482957750558853 183 | ================================================== 184 | topic 15 185 | -------------------------------------------------- 186 | rank 1. home: 0.00700838677585125 187 | rank 2. run: 0.006053796038031578 188 | rank 3. right: 0.005981859751045704 189 | rank 4. left: 0.005203519947826862 190 | rank 5. part: 0.005086812656372786 191 | rank 6. night: 0.004532037302851677 192 | rank 7. put: 0.004220300819724798 193 | rank 8. took: 0.003913923632353544 194 | rank 9. called: 0.003663261653855443 195 | rank 10. early: 0.0034683081321418285 196 | ================================================== 197 | topic 16 198 | -------------------------------------------------- 199 | rank 1. computer: 0.05519622564315796 200 | rank 2. system: 0.038603898137807846 201 | rank 3. zzz_microsoft: 0.0243679191917181 202 | rank 4. software: 0.02125958725810051 203 | rank 5. technology: 0.016031846404075623 204 | rank 6. window: 0.015655480325222015 205 | rank 7. mail: 0.01430702954530716 206 | rank 8. user: 0.011626251973211765 207 | rank 9. 
information: 0.010091814212501049 208 | rank 10. operating: 0.00756523571908474 209 | ================================================== 210 | topic 17 211 | -------------------------------------------------- 212 | rank 1. law: 0.02723626047372818 213 | rank 2. right: 0.013789551332592964 214 | rank 3. political: 0.012496591545641422 215 | rank 4. government: 0.012413491494953632 216 | rank 5. religious: 0.01058514229953289 217 | rank 6. immigrant: 0.010227411054074764 218 | rank 7. power: 0.008888700045645237 219 | rank 8. ruling: 0.006956734228879213 220 | rank 9. court: 0.006303566973656416 221 | rank 10. opposition: 0.006150286644697189 222 | ================================================== 223 | topic 18 224 | -------------------------------------------------- 225 | rank 1. driver: 0.026438318192958832 226 | rank 2. car: 0.021013904362916946 227 | rank 3. race: 0.02072136662900448 228 | rank 4. racing: 0.013081525452435017 229 | rank 5. airline: 0.012061871588230133 230 | rank 6. flight: 0.009761194698512554 231 | rank 7. track: 0.008779071271419525 232 | rank 8. races: 0.007440405432134867 233 | rank 9. airlines: 0.00735550606623292 234 | rank 10. carrier: 0.0064879427663981915 235 | ================================================== 236 | topic 19 237 | -------------------------------------------------- 238 | rank 1. zzz_bush: 0.018247155472636223 239 | rank 2. official: 0.015417532064020634 240 | rank 3. zzz_united_states: 0.01530768908560276 241 | rank 4. administration: 0.013708231039345264 242 | rank 5. leader: 0.010346359573304653 243 | rank 6. countries: 0.009353779256343842 244 | rank 7. zzz_u_s: 0.009245852008461952 245 | rank 8. government: 0.009168907068669796 246 | rank 9. zzz_iraq: 0.009057393297553062 247 | rank 10. military: 0.008723369799554348 248 | ================================================== 249 | topic 20 250 | -------------------------------------------------- 251 | rank 1. percent: 0.044513553380966187 252 | rank 2. 
stock: 0.023978371173143387 253 | rank 3. market: 0.022495266050100327 254 | rank 4. fund: 0.013825907371938229 255 | rank 5. billion: 0.012179437093436718 256 | rank 6. quarter: 0.010966183617711067 257 | rank 7. investor: 0.01015525683760643 258 | rank 8. investment: 0.009771433658897877 259 | rank 9. million: 0.009703823365271091 260 | rank 10. analyst: 0.00947034452110529 261 | ================================================== 262 | topic 21 263 | -------------------------------------------------- 264 | rank 1. book: 0.02586548589169979 265 | rank 2. art: 0.009416724555194378 266 | rank 3. artist: 0.007856832817196846 267 | rank 4. collection: 0.007611869368702173 268 | rank 5. painting: 0.0066984654404222965 269 | rank 6. fashion: 0.005222611129283905 270 | rank 7. century: 0.005118153523653746 271 | rank 8. writer: 0.004741827957332134 272 | rank 9. designer: 0.004720605909824371 273 | rank 10. author: 0.004426541272550821 274 | ================================================== 275 | topic 22 276 | -------------------------------------------------- 277 | rank 1. music: 0.03731286898255348 278 | rank 2. song: 0.023323602974414825 279 | rank 3. cell: 0.015249419026076794 280 | rank 4. album: 0.011770840734243393 281 | rank 5. band: 0.011705778539180756 282 | rank 6. musical: 0.008127620443701744 283 | rank 7. singer: 0.006815788336098194 284 | rank 8. concert: 0.006784858647733927 285 | rank 9. jazz: 0.006698825396597385 286 | rank 10. sound: 0.006471691187471151 287 | ================================================== 288 | topic 23 289 | -------------------------------------------------- 290 | rank 1. web: 0.04922621324658394 291 | rank 2. site: 0.03805321082472801 292 | rank 3. www: 0.03708707541227341 293 | rank 4. com: 0.03255585581064224 294 | rank 5. online: 0.027454305440187454 295 | rank 6. zzz_internet: 0.019746430218219757 296 | rank 7. sites: 0.018789643421769142 297 | rank 8. information: 0.012109276838600636 298 | rank 9. 
mail: 0.010703440755605698 299 | rank 10. internet: 0.010465497151017189 300 | ================================================== 301 | topic 24 302 | -------------------------------------------------- 303 | rank 1. cup: 0.013092475943267345 304 | rank 2. food: 0.011349334381520748 305 | rank 3. minutes: 0.008257454261183739 306 | rank 4. add: 0.007631846237927675 307 | rank 5. tablespoon: 0.006674299016594887 308 | rank 6. oil: 0.006410549394786358 309 | rank 7. pepper: 0.005671842489391565 310 | rank 8. sugar: 0.005601006560027599 311 | rank 9. teaspoon: 0.005426750052720308 312 | rank 10. water: 0.005266525782644749 313 | ================================================== 314 | topic 25 315 | -------------------------------------------------- 316 | rank 1. team: 0.03806942701339722 317 | rank 2. season: 0.0169094055891037 318 | rank 3. games: 0.014860374853014946 319 | rank 4. zzz_olympic: 0.013387414626777172 320 | rank 5. coach: 0.011522733606398106 321 | rank 6. zzz_miami: 0.00958396214991808 322 | rank 7. athletes: 0.009186827577650547 323 | rank 8. player: 0.00904573779553175 324 | rank 9. football: 0.008782983757555485 325 | rank 10. defense: 0.007022276986390352 326 | ================================================== 327 | topic 26 328 | -------------------------------------------------- 329 | rank 1. zzz_united_states: 0.00901876762509346 330 | rank 2. zzz_american: 0.008501513861119747 331 | rank 3. american: 0.008129569701850414 332 | rank 4. country: 0.006140291225165129 333 | rank 5. government: 0.005337539594620466 334 | rank 6. group: 0.005324787925928831 335 | rank 7. german: 0.0052407230250537395 336 | rank 8. history: 0.004972025752067566 337 | rank 9. french: 0.0047142705880105495 338 | rank 10. family: 0.004625052213668823 339 | ================================================== 340 | topic 27 341 | -------------------------------------------------- 342 | rank 1. women: 0.07863874733448029 343 | rank 2. gay: 0.020628171041607857 344 | rank 3. 
dog: 0.014878431335091591 345 | rank 4. magazine: 0.01347420085221529 346 | rank 5. woman: 0.012085708789527416 347 | rank 6. sex: 0.009264894761145115 348 | rank 7. female: 0.008259394206106663 349 | rank 8. cat: 0.006200404372066259 350 | rank 9. male: 0.0057057044468820095 351 | rank 10. lesbian: 0.0040387725457549095 352 | ================================================== 353 | topic 28 354 | -------------------------------------------------- 355 | rank 1. digital: 0.011757075786590576 356 | rank 2. screen: 0.0080463457852602 357 | rank 3. wine: 0.007102092728018761 358 | rank 4. device: 0.006819858215749264 359 | rank 5. wines: 0.0068092974834144115 360 | rank 6. chip: 0.006679498124867678 361 | rank 7. computer: 0.006480266340076923 362 | rank 8. devices: 0.005909178406000137 363 | rank 9. electronic: 0.0056115672923624516 364 | rank 10. images: 0.004710317123681307 365 | ================================================== 366 | topic 29 367 | -------------------------------------------------- 368 | rank 1. campaign: 0.03327873349189758 369 | rank 2. political: 0.014918365515768528 370 | rank 3. democratic: 0.014790846966207027 371 | rank 4. election: 0.014583878219127655 372 | rank 5. republican: 0.014538025483489037 373 | rank 6. voter: 0.01402147114276886 374 | rank 7. zzz_al_gore: 0.013029148802161217 375 | rank 8. zzz_party: 0.012214157730340958 376 | rank 9. zzz_republican: 0.011119640432298183 377 | rank 10. candidates: 0.010824044235050678 378 | ================================================== 379 | topic 30 380 | -------------------------------------------------- 381 | rank 1. school: 0.03626062348484993 382 | rank 2. student: 0.021992284804582596 383 | rank 3. black: 0.015230956487357616 384 | rank 4. group: 0.013538197614252567 385 | rank 5. public: 0.010991621762514114 386 | rank 6. percent: 0.010974901728332043 387 | rank 7. zzz_texas: 0.008697726763784885 388 | rank 8. gun: 0.007661579176783562 389 | rank 9. 
member: 0.0075561245903372765 390 | rank 10. white: 0.007528342306613922 391 | ================================================== 392 | topic 31 393 | -------------------------------------------------- 394 | rank 1. zzz_fbi: 0.025642145425081253 395 | rank 2. fish: 0.020048771053552628 396 | rank 3. bird: 0.013764469884335995 397 | rank 4. agent: 0.011454230174422264 398 | rank 5. irish: 0.009724821895360947 399 | rank 6. fishing: 0.00831819698214531 400 | rank 7. zzz_timothy_mcveigh: 0.006179510150104761 401 | rank 8. zzz_brazil: 0.006174848414957523 402 | rank 9. hijacker: 0.0060051921755075455 403 | rank 10. zzz_simon: 0.005628513637930155 404 | ================================================== 405 | topic 32 406 | -------------------------------------------------- 407 | rank 1. company: 0.07715368270874023 408 | rank 2. companies: 0.033467356115579605 409 | rank 3. business: 0.019932780414819717 410 | rank 4. million: 0.01110815443098545 411 | rank 5. deal: 0.01099175214767456 412 | rank 6. executives: 0.010963932611048222 413 | rank 7. executive: 0.010428434237837791 414 | rank 8. market: 0.0098022585734725 415 | rank 9. stock: 0.009284550324082375 416 | rank 10. chief: 0.008711854927241802 417 | ================================================== 418 | topic 33 419 | -------------------------------------------------- 420 | rank 1. consumer: 0.02195882610976696 421 | rank 2. percent: 0.020870916545391083 422 | rank 3. companies: 0.015635766088962555 423 | rank 4. industry: 0.015347079373896122 424 | rank 5. market: 0.014645704068243504 425 | rank 6. cost: 0.012568947859108448 426 | rank 7. customer: 0.012199653312563896 427 | rank 8. prices: 0.010143699124455452 428 | rank 9. high: 0.009660380892455578 429 | rank 10. worker: 0.006465692073106766 430 | ================================================== 431 | topic 34 432 | -------------------------------------------------- 433 | rank 1. season: 0.021334033459424973 434 | rank 2. 
team: 0.016839321702718735 435 | rank 3. game: 0.014815553091466427 436 | rank 4. inning: 0.014347057789564133 437 | rank 5. player: 0.013774506747722626 438 | rank 6. yankees: 0.011174232698976994 439 | rank 7. run: 0.010817022994160652 440 | rank 8. baseball: 0.01055373065173626 441 | rank 9. games: 0.010321191512048244 442 | rank 10. hit: 0.010284436866641045 443 | ================================================== 444 | topic 35 445 | -------------------------------------------------- 446 | rank 1. zzz_george_bush: 0.05796745792031288 447 | rank 2. zzz_al_gore: 0.04237228259444237 448 | rank 3. election: 0.022491727024316788 449 | rank 4. president: 0.020312432199716568 450 | rank 5. ballot: 0.019908472895622253 451 | rank 6. zzz_florida: 0.016183944419026375 452 | rank 7. presidential: 0.015332216396927834 453 | rank 8. votes: 0.01442129909992218 454 | rank 9. vote: 0.009808804839849472 455 | rank 10. zzz_bush: 0.00961968582123518 456 | ================================================== 457 | topic 36 458 | -------------------------------------------------- 459 | rank 1. palestinian: 0.02687947452068329 460 | rank 2. zzz_israel: 0.023833250626921654 461 | rank 3. zzz_israeli: 0.013304143212735653 462 | rank 4. soldier: 0.010826818645000458 463 | rank 5. peace: 0.010164049454033375 464 | rank 6. zzz_yasser_arafat: 0.009658769704401493 465 | rank 7. israeli: 0.009265914559364319 466 | rank 8. war: 0.00923923496156931 467 | rank 9. israelis: 0.008119330741465092 468 | rank 10. military: 0.007811776362359524 469 | ================================================== 470 | topic 37 471 | -------------------------------------------------- 472 | rank 1. death: 0.023664837703108788 473 | rank 2. prison: 0.016880618408322334 474 | rank 3. murder: 0.01633421890437603 475 | rank 4. book: 0.009351547807455063 476 | rank 5. killed: 0.009010221809148788 477 | rank 6. prisoner: 0.007692103274166584 478 | rank 7. killing: 0.007337935268878937 479 | rank 8. 
woman: 0.007256744429469109 480 | rank 9. victim: 0.007001840975135565 481 | rank 10. shooting: 0.006456068251281977 482 | ================================================== 483 | topic 38 484 | -------------------------------------------------- 485 | rank 1. million: 0.01617966778576374 486 | rank 2. newspaper: 0.009461159817874432 487 | rank 3. show: 0.006403861101716757 488 | rank 4. program: 0.005598483607172966 489 | rank 5. network: 0.0053542195819318295 490 | rank 6. money: 0.00485030934214592 491 | rank 7. according: 0.004323051776736975 492 | rank 8. special: 0.0040418170392513275 493 | rank 9. help: 0.004037346225231886 494 | rank 10. past: 0.0039222449995577335 495 | ================================================== 496 | topic 39 497 | -------------------------------------------------- 498 | rank 1. show: 0.022530050948262215 499 | rank 2. character: 0.009580017998814583 500 | rank 3. audience: 0.005444356705993414 501 | rank 4. television: 0.004325090907514095 502 | rank 5. series: 0.004303744062781334 503 | rank 6. look: 0.004119543824344873 504 | rank 7. love: 0.00407353974878788 505 | rank 8. film: 0.004058054182678461 506 | rank 9. find: 0.003848094493150711 507 | rank 10. young: 0.0036786773707717657 508 | ================================================== 509 | topic 40 510 | -------------------------------------------------- 511 | rank 1. drug: 0.047516606748104095 512 | rank 2. government: 0.012602291069924831 513 | rank 3. zzz_aid: 0.01227615773677826 514 | rank 4. zzz_india: 0.010664834640920162 515 | rank 5. countries: 0.008103608153760433 516 | rank 6. million: 0.007103894371539354 517 | rank 7. food: 0.006576470099389553 518 | rank 8. farmer: 0.006402278784662485 519 | rank 9. country: 0.006317282561212778 520 | rank 10. zzz_united_states: 0.0062563237734138966 521 | ================================================== 522 | topic 41 523 | -------------------------------------------------- 524 | rank 1. 
game: 0.026529431343078613 525 | rank 2. player: 0.022719431668519974 526 | rank 3. games: 0.0206462275236845 527 | rank 4. sport: 0.016915155574679375 528 | rank 5. fan: 0.012125855311751366 529 | rank 6. soccer: 0.011505456641316414 530 | rank 7. video: 0.010653939098119736 531 | rank 8. zzz_nbc: 0.009938360191881657 532 | rank 9. zzz_nba: 0.009428229182958603 533 | rank 10. team: 0.008263841271400452 534 | ================================================== 535 | topic 42 536 | -------------------------------------------------- 537 | rank 1. tax: 0.04971655085682869 538 | rank 2. cut: 0.026394149288535118 539 | rank 3. economy: 0.0230980534106493 540 | rank 4. economic: 0.017415864393115044 541 | rank 5. zzz_mexico: 0.01618388667702675 542 | rank 6. government: 0.01595328189432621 543 | rank 7. taxes: 0.014780825935304165 544 | rank 8. spending: 0.01243556011468172 545 | rank 9. income: 0.012374772690236568 546 | rank 10. zzz_social_security: 0.010477164760231972 547 | ================================================== 548 | topic 43 549 | -------------------------------------------------- 550 | rank 1. zzz_bush: 0.027270827442407608 551 | rank 2. bill: 0.024806691333651543 552 | rank 3. zzz_congress: 0.018335092812776566 553 | rank 4. zzz_white_house: 0.016858264803886414 554 | rank 5. federal: 0.01354345865547657 555 | rank 6. zzz_senate: 0.01329002995043993 556 | rank 7. plan: 0.012937983497977257 557 | rank 8. proposal: 0.010213974863290787 558 | rank 9. administration: 0.009349077008664608 559 | rank 10. health: 0.008263114839792252 560 | ================================================== 561 | topic 44 562 | -------------------------------------------------- 563 | rank 1. point: 0.020692508667707443 564 | rank 2. team: 0.018113387748599052 565 | rank 3. game: 0.015103872865438461 566 | rank 4. season: 0.013727625831961632 567 | rank 5. play: 0.012306117452681065 568 | rank 6. goal: 0.012093267403542995 569 | rank 7. games: 0.011415580287575722 570 | rank 8. 
shot: 0.011306485161185265 571 | rank 9. king: 0.011238034814596176 572 | rank 10. player: 0.008728481829166412 573 | ================================================== 574 | topic 45 575 | -------------------------------------------------- 576 | rank 1. player: 0.013769405893981457 577 | rank 2. point: 0.012727474793791771 578 | rank 3. win: 0.012649298645555973 579 | rank 4. play: 0.011700315400958061 580 | rank 5. round: 0.010591110214591026 581 | rank 6. season: 0.010317614302039146 582 | rank 7. shot: 0.01031588576734066 583 | rank 8. game: 0.00999273732304573 584 | rank 9. team: 0.009904314763844013 585 | rank 10. final: 0.009542282670736313 586 | ================================================== 587 | topic 46 588 | -------------------------------------------------- 589 | rank 1. zzz_china: 0.015810564160346985 590 | rank 2. oil: 0.014123033732175827 591 | rank 3. power: 0.013019545003771782 592 | rank 4. zzz_russia: 0.012522333301603794 593 | rank 5. energy: 0.01063102949410677 594 | rank 6. plant: 0.010357524268329144 595 | rank 7. gas: 0.00931472983211279 596 | rank 8. nuclear: 0.008214462548494339 597 | rank 9. missile: 0.007829232141375542 598 | rank 10. environmental: 0.007554346229881048 599 | ================================================== 600 | topic 47 601 | -------------------------------------------------- 602 | rank 1. com: 0.02512955479323864 603 | rank 2. zzz_laker: 0.015019885264337063 604 | rank 3. palm: 0.013598510064184666 605 | rank 4. daily: 0.013184287585318089 606 | rank 5. statesman: 0.013182769529521465 607 | rank 6. beach: 0.01314060389995575 608 | rank 7. question: 0.010342201218008995 609 | rank 8. zzz_eastern: 0.009052561596035957 610 | rank 9. information: 0.008214504458010197 611 | rank 10. austin: 0.007981293834745884 612 | ================================================== 613 | topic 48 614 | -------------------------------------------------- 615 | rank 1. film: 0.034848302602767944 616 | rank 2. 
movie: 0.02526075392961502 617 | rank 3. actor: 0.013231894932687283 618 | rank 4. movies: 0.008959283120930195 619 | rank 5. zzz_hollywood: 0.008070441894233227 620 | rank 6. play: 0.007740044500678778 621 | rank 7. theater: 0.00727312033995986 622 | rank 8. director: 0.005834080744534731 623 | rank 9. character: 0.005199376493692398 624 | rank 10. zzz_oscar: 0.004690317437052727 625 | ================================================== 626 | topic 49 627 | -------------------------------------------------- 628 | rank 1. patient: 0.02304932475090027 629 | rank 2. doctor: 0.01952706277370453 630 | rank 3. cancer: 0.011629555374383926 631 | rank 4. medical: 0.011445121839642525 632 | rank 5. disease: 0.011433145962655544 633 | rank 6. hospital: 0.009982189163565636 634 | rank 7. study: 0.008990893140435219 635 | rank 8. treatment: 0.007559608668088913 636 | rank 9. blood: 0.007204002235084772 637 | rank 10. test: 0.007000159937888384 638 | ================================================== 639 | topic 50 640 | -------------------------------------------------- 641 | rank 1. million: 0.028744814917445183 642 | rank 2. contract: 0.016937075182795525 643 | rank 3. agent: 0.009414087980985641 644 | rank 4. manager: 0.007703984156250954 645 | rank 5. business: 0.006961227394640446 646 | rank 6. high: 0.005569536704570055 647 | rank 7. club: 0.005377542693167925 648 | rank 8. past: 0.005371585488319397 649 | rank 9. career: 0.005363883450627327 650 | rank 10. hand: 0.005337761249393225 651 | -------------------------------------------------------------------------------- /examples/example_lda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # pylint: disable=no-name-in-module,logging-format-truncated 8 | # pylint: disable=too-few-public-methods 9 | import os 10 | from os.path import join as pjoin 11 | import time 12 | import pickle 13 | import subprocess 14 | 15 | import tqdm 16 | import fire 17 | import wget 18 | import h5py 19 | import numpy as np 20 | import pandas as pd 21 | 22 | # import gensim 23 | from gensim.models.ldamulticore import LdaMulticore 24 | 25 | from cusim import aux, CuLDA 26 | 27 | LOGGER = aux.get_logger() 28 | # DATASET = "nips" 29 | DATASET = "nytimes" 30 | DIR_PATH = "./res" 31 | BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \ 32 | "bag-of-words/" 33 | 34 | def download(): 35 | if not os.path.exists(DIR_PATH): 36 | os.makedirs(DIR_PATH, exist_ok=True) 37 | 38 | if os.path.exists(pjoin(DIR_PATH, f"docword.{DATASET}.txt")): 39 | LOGGER.info("path %s already exists", 40 | pjoin(DIR_PATH, f"docword.{DATASET}.txt")) 41 | return 42 | 43 | # download docword 44 | filename = f"docword.{DATASET}.txt.gz" 45 | out_path = pjoin(DIR_PATH, filename) 46 | LOGGER.info("download %s to %s", BASE_URL + filename, out_path) 47 | wget.download(BASE_URL + filename, out=out_path) 48 | print() 49 | 50 | # decompress 51 | cmd = ["gunzip", "-c", out_path, ">", 52 | pjoin(DIR_PATH, f"docword.{DATASET}.txt")] 53 | cmd = " ".join(cmd) 54 | subprocess.call(cmd, shell=True) 55 | os.remove(pjoin(DIR_PATH, filename)) 56 | 57 | # download vocab 58 | filename = f"vocab.{DATASET}.txt" 59 | out_path = pjoin(DIR_PATH, filename) 60 | LOGGER.info("download %s to %s", BASE_URL + filename, out_path) 61 | wget.download(BASE_URL + filename, out=out_path) 62 | print() 63 | 64 | def run_cusim(): 65 | download() 66 | data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt") 67 | keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt") 68 | processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5") 69 | opt = { 70 | "data_path": data_path, 71 | "processed_data_path": processed_data_path, 72 | 
def run_cusim():
  """Train cusim CuLDA on the downloaded dataset and dump its topics.

  Returns the wall-clock training time in seconds.
  """
  download()
  data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
  keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
  processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5")
  opt = {
    "data_path": data_path,
    "processed_data_path": processed_data_path,
    "keys_path": keys_path,
    "num_topics": 50,
    "num_iters_in_e_step": 10,
    "reuse_gamma": True,
    # "skip_preprocess": os.path.exists(processed_data_path),
  }
  start = time.time()
  lda = CuLDA(opt)
  lda.train_model()
  el0 = time.time() - start
  LOGGER.info("elapsed for training LDA using cusim: %.4e sec", el0)
  h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5")
  lda.save_h5_model(h5_model_path)
  show_cusim_topics(h5_model_path)
  return el0

def show_cusim_topics(h5_model_path, topk=10):
  """Load a cusim h5 model and write its top-`topk` words per topic."""
  # close the h5 file deterministically (the previous code leaked the handle);
  # [:]-slicing copies into numpy arrays, so reading after close is safe
  with h5py.File(h5_model_path, "r") as h5f:
    # beta is stored transposed relative to (topics, vocab) — transpose back
    beta = h5f["beta"][:, :].T
    keys = h5f["keys"][:]
  show_topics(beta, keys, topk, "cusim.topics.txt")

def build_gensim_corpus():
  """Parse the docword file into gensim bag-of-words corpus format.

  Returns a list of documents, each a list of (word_id, count) pairs.
  The parsed corpus is cached as a pickle so repeated runs skip parsing.
  """
  corpus_path = pjoin(DIR_PATH, f"docword.{DATASET}.pk")
  if os.path.exists(corpus_path):
    LOGGER.info("load corpus from %s", corpus_path)
    with open(corpus_path, "rb") as fin:
      return pickle.load(fin)

  # get corpus for gensim lda
  data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
  LOGGER.info("build corpus from %s", data_path)
  docs, doc, curid = [], [], -1
  with open(data_path, "r") as fin:
    for idx, line in tqdm.tqdm(enumerate(fin)):
      # the first three lines are the header (num docs, vocab size, nnz)
      if idx < 3:
        continue
      docid, wordid, count = line.strip().split()
      # ids in the file are 1-based; convert to 0-based
      docid, wordid, count = int(docid) - 1, int(wordid) - 1, float(count)
      if 0 <= curid < docid:
        # a new document started; flush the previous one
        docs.append(doc)
        doc = []
      doc.append((wordid, count))
      curid = docid
  docs.append(doc)
  LOGGER.info("save corpus to %s", corpus_path)
  with open(corpus_path, "wb") as fout:
    pickle.dump(docs, fout, 2)
  return docs

def run_gensim():
  """Train gensim LdaMulticore on the same corpus and dump its topics.

  Returns the wall-clock training time in seconds.
  """
  docs = build_gensim_corpus()
  keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
  LOGGER.info("load vocab from %s", keys_path)
  id2word = {}
  with open(keys_path, "rb") as fin:
    for idx, line in enumerate(fin):
      # NOTE(review): values stay bytes (file opened "rb"); gensim mainly
      # needs the vocab size here — confirm before printing topics via gensim
      id2word[idx] = line.strip()

  start = time.time()
  # workers=None lets gensim pick (cpu_count - 1) workers
  lda = LdaMulticore(docs, num_topics=50, workers=None,
                     id2word=id2word, iterations=10)
  el0 = time.time() - start
  LOGGER.info("elapsed for training lda using gensim: %.4e sec", el0)
  model_path = pjoin(DIR_PATH, "gensim.lda.model")
  LOGGER.info("save gensim lda model to %s", model_path)
  lda.save(model_path)
  show_gensim_topics(model_path)
  return el0

def show_gensim_topics(model_path=None, topk=10):
  """Load a saved gensim LDA model and write its top-`topk` words per topic."""
  # load beta (topic-word weights) and normalize each row to a distribution
  model_path = model_path or pjoin(DIR_PATH, "gensim.lda.model")
  LOGGER.info("load gensim lda model from %s", model_path)
  lda = LdaMulticore.load(model_path)
  beta = lda.state.get_lambda()
  beta /= np.sum(beta, axis=1)[:, None]

  # load keys (kept as bytes; show_topics decodes them)
  keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
  LOGGER.info("load vocab from %s", keys_path)
  with open(keys_path, "rb") as fin:
    keys = [line.strip() for line in fin]
  show_topics(beta, keys, topk, "gensim.topics.txt")

def _emit(fout, line):
  # mirror every result line to both stdout and the result file
  print(line)
  fout.write(line + "\n")

def show_topics(beta, keys, topk, result_path):
  """Write the top-`topk` words of each topic to `result_path` (and stdout).

  `beta` is a (topics, vocab) array; `keys` is a list of bytes vocab words.
  """
  LOGGER.info("save results to %s (topk: %d)", result_path, topk)
  # `with` guarantees the file is closed even if a topic row raises
  with open(result_path, "w") as fout:
    for idx in range(beta.shape[0]):
      _emit(fout, "=" * 50)
      _emit(fout, f"topic {idx + 1}")
      _emit(fout, "-" * 50)
      _beta = beta[idx, :]
      indices = np.argsort(-_beta)[:topk]
      for rank, wordid in enumerate(indices):
        word = keys[wordid].decode("utf8")
        prob = _beta[wordid]
        _emit(fout, f"rank {rank + 1}. {word}: {prob}")


def run_experiments():
  """Run both trainers and print a markdown comparison table."""
  training_time = {"attr": "training time (sec)"}
  training_time["gensim (8 vpus)"] = run_gensim()
  training_time["cusim"] = run_cusim()
  df0 = pd.DataFrame([training_time])
  df0.set_index("attr", inplace=True)
  print(df0.to_markdown())


if __name__ == "__main__":
  fire.Fire()
= api.load(DATASET, return_path=True) 53 | LOGGER.info("filepath: %s", filepath) 54 | cmd = ["gunzip", "-c", filepath, ">", DATA_PATH] 55 | cmd = " ".join(cmd) 56 | LOGGER.info("cmd: %s", cmd) 57 | subprocess.call(cmd, shell=True) 58 | preprocess_data() 59 | 60 | def preprocess_data(): 61 | tokenizer = RegexpTokenizer(r'\w+') 62 | nltk.download("wordnet") 63 | lemmatizer = nltk.stem.wordnet.WordNetLemmatizer() 64 | fout = open(DATA_PATH + ".tmp", "wb") 65 | with open(DATA_PATH, "rb") as fin: 66 | for line in tqdm.tqdm(fin): 67 | line = line.decode("utf8").strip() 68 | line = preprocess_line(line, tokenizer, lemmatizer) 69 | fout.write((line + "\n").encode("utf8")) 70 | fout.close() 71 | os.rename(DATA_PATH + ".tmp", DATA_PATH) 72 | 73 | def preprocess_line(line, tokenizer, lemmatizer): 74 | line = line.lower() 75 | line = tokenizer.tokenize(line) 76 | line = [token for token in line 77 | if not token.isnumeric() and len(token) > 1] 78 | line = [lemmatizer.lemmatize(token) for token in line] 79 | return " ".join(line) 80 | 81 | def run_cusim(skip_gram=False, hierarchical_softmax=False): 82 | download() 83 | opt = { 84 | "data_path": DATA_PATH, 85 | "processed_data_dir": PROCESSED_DATA_DIR, 86 | # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR), 87 | "num_dims": NUM_DIMS, 88 | "epochs": EPOCHS, 89 | "word_min_count": MIN_COUNT, 90 | "lr": 0.001, 91 | "io": { 92 | "lower": False 93 | }, 94 | "neg": 0 if hierarchical_softmax else NEG_SIZE, 95 | "skip_gram": skip_gram, 96 | "cbow_mean": CBOW_MEAN, 97 | } 98 | start = time.time() 99 | w2v = CuW2V(opt) 100 | w2v.train_model() 101 | elapsed = time.time() - start 102 | LOGGER.info("elapsed for cusim w2v training: %.4e sec", elapsed) 103 | w2v.save_word2vec_format(CUSIM_MODEL, binary=False) 104 | return elapsed, evaluate_w2v_model(CUSIM_MODEL) 105 | 106 | def run_gensim(skip_gram=False, hierarchical_softmax=False, workers=8): 107 | download() 108 | start = time.time() 109 | model = 
gensim.models.Word2Vec(corpus_file=DATA_PATH, workers=workers, 110 | sg=skip_gram, hs=hierarchical_softmax, 111 | min_alpha=LEARNING_RATE, min_count=MIN_COUNT, 112 | alpha=LEARNING_RATE, negative=NEG_SIZE, 113 | iter=EPOCHS, cbow_mean=CBOW_MEAN, 114 | size=NUM_DIMS) 115 | elapsed = time.time() - start 116 | LOGGER.info("elapsed for gensim w2v training: %.4e sec", elapsed) 117 | model.wv.save_word2vec_format(GENSIM_MODEL, binary=False) 118 | LOGGER.info("gensim w2v model is saved to %s", GENSIM_MODEL) 119 | return elapsed, evaluate_w2v_model(GENSIM_MODEL) 120 | 121 | def evaluate_w2v_model(model=GENSIM_MODEL): 122 | LOGGER.info("load word2vec format model from %s", model) 123 | model = gensim.models.KeyedVectors.load_word2vec_format(model) 124 | results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"), 125 | case_insensitive=False) 126 | LOGGER.info("evaluation results: %s", results) 127 | return results 128 | 129 | # gpu model variable is for being displayed in markdown 130 | # please put the real gpu modelname 131 | def run_experiments(skip_gram=False, hierarchical_softmax=False, 132 | gpu_model="NVIDIA T4"): 133 | training_time = {"attr": "training time (sec)"} 134 | pearson = {"attr": "pearson"} 135 | spearman = {"attr": "spearman"} 136 | for i in [1, 2, 4, 8]: 137 | elapsed, evals = run_gensim(skip_gram, hierarchical_softmax, i) 138 | training_time[f"{i} workers (gensim)"] = elapsed 139 | pearson[f"{i} workers (gensim)"] = evals[0][0] 140 | spearman[f"{i} workers (gensim)"] = evals[1][0] 141 | elapsed, evals = run_cusim(skip_gram, hierarchical_softmax) 142 | gpu_title = f"{gpu_model} (cusim)" 143 | training_time[gpu_title] = elapsed 144 | pearson[gpu_title] = evals[0][0] 145 | spearman[gpu_title] = evals[1][0] 146 | df0 = pd.DataFrame([training_time, pearson, spearman]) 147 | df0.set_index("attr", inplace=True) 148 | print(df0.to_markdown()) 149 | 150 | # gpu model variable is for being displayed in markdown 151 | # please put the real gpu modelname 
152 | def run_various_experiments(gpu_model="NVIDIA T4"): 153 | for sg0 in [True, False]: 154 | for hs0 in [True, False]: 155 | print("=" * 100) 156 | LOGGER.info("setting: %s, %s", 157 | "skip gram" if sg0 else "cbow", 158 | "hierarchical softmax" if hs0 else "negative sampling") 159 | run_experiments(sg0, hs0, gpu_model) 160 | 161 | 162 | if __name__ == "__main__": 163 | fire.Fire() 164 | -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | fire 2 | gensim==3.8.3 3 | nltk 4 | tqdm 5 | wget 6 | pandas 7 | tabulate 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=1.3.2", 4 | "numpy", 5 | "pybind11" 6 | ] 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | h5py 2 | jsmin 3 | numpy 4 | scipy 5 | pybind11 6 | protobuf==3.10.0 7 | grpcio-tools==1.27.1 8 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021 Jisang Yoon 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the Apache 2.0 license found in the 5 | # LICENSE file in the root directory of this source tree. 
# pylint: disable=fixme,too-few-public-methods
# reference: https://github.com/kakao/buffalo/blob/
# 5f571c2c7d8227e6625c6e538da929e4db11b66d/setup.py
"""cusim
"""
import os
import sys
import glob
import pathlib
import platform
import sysconfig
import subprocess
from setuptools import setup, Extension

import pybind11
import numpy as np
from cuda_setup import CUDA, BUILDEXT


DOCLINES = __doc__.split("\n")

# TODO: Python3 Support
if sys.version_info[:3] < (3, 6):
  raise RuntimeError("Python version 3.6 or later required.")

# fix: `assert` is stripped under `python -O`; validate explicitly instead
if platform.system() != 'Linux':  # TODO: MacOS
  raise RuntimeError("cusim currently builds on Linux only.")
with open("requirements.txt", "r") as fin:
  INSTALL_REQUIRES = [line.strip() for line in fin]

MAJOR = 0
MINOR = 0
MICRO = 2
RELEASE = True
STAGE = {True: '', False: 'b'}.get(RELEASE)
VERSION = f'{MAJOR}.{MINOR}.{MICRO}{STAGE}'
STATUS = {False: 'Development Status :: 4 - Beta',
          True: 'Development Status :: 5 - Production/Stable'}

CLASSIFIERS = """{status}
Programming Language :: C++
Programming Language :: Python :: 3.6
Operating System :: POSIX :: Linux
Operating System :: Unix
Operating System :: MacOS
License :: OSI Approved :: Apache Software License""".format( \
    status=STATUS.get(RELEASE))
# compiled extensions are installed next to the pure-python package
CLIB_DIR = os.path.join(sysconfig.get_path('purelib'), 'cusim')
LIBRARY_DIRS = [CLIB_DIR]


def get_extend_compile_flags():
  flags = ['-march=native']
  return flags


class CMakeExtension(Extension):
  # marker subclass: BuildExtension routes these through cmake()
  extension_type = 'cmake'

  def __init__(self, name):
    super().__init__(name, sources=[])


extend_compile_flags = get_extend_compile_flags()
extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \
  extend_compile_flags
util_srcs = glob.glob("cpp/src/utils/*.cc")


def _cuda_extension(name, cu_source, binding_source):
  # the two CUDA extensions were copy-paste duplicates; they differ only
  # in module name and their .cu / bindings.cc sources
  return Extension(
      name,
      sources=util_srcs + [cu_source, binding_source,
                           "3rd/json11/json11.cpp"],
      language="c++",
      extra_compile_args=extra_compile_args,
      extra_link_args=["-fopenmp"],
      library_dirs=[CUDA['lib64']],
      libraries=['cudart', 'curand'],
      extra_objects=[],
      include_dirs=[
          "cpp/include/", np.get_include(), pybind11.get_include(),
          pybind11.get_include(True), CUDA['include'],
          "3rd/json11", "3rd/spdlog/include"])


extensions = [
    # ioutils is CPU-only: no CUDA include/lib dirs
    Extension("cusim.ioutils.ioutils_bind",
              sources=util_srcs + [
                  "cusim/ioutils/bindings.cc",
                  "3rd/json11/json11.cpp"],
              language="c++",
              extra_compile_args=extra_compile_args,
              extra_link_args=["-fopenmp"],
              extra_objects=[],
              include_dirs=[
                  "cpp/include/", np.get_include(), pybind11.get_include(),
                  pybind11.get_include(True),
                  "3rd/json11", "3rd/spdlog/include"]),
    _cuda_extension("cusim.culda.culda_bind",
                    "cpp/src/culda/culda.cu", "cusim/culda/bindings.cc"),
    _cuda_extension("cusim.cuw2v.cuw2v_bind",
                    "cpp/src/cuw2v/cuw2v.cu", "cusim/cuw2v/bindings.cc"),
]
# Return the git revision as a string
def git_version():
  """Return the current git HEAD revision, or "Unknown" if unavailable."""
  def _minimal_ext_cmd(cmd):
    # construct a minimal environment so git output is predictable
    env = {}
    for k in ['SYSTEMROOT', 'PATH']:
      val = os.environ.get(k)
      if val is not None:
        env[k] = val
    # check_output raises on a non-zero exit, unlike the bare Popen it
    # replaces (which silently yielded an empty revision string)
    return subprocess.check_output(cmd, env=env)

  try:
    out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
    git_revision = out.strip().decode('ascii')
  except (OSError, subprocess.SubprocessError):
    # git missing, not a repository, or any other git failure
    git_revision = "Unknown"

  # fix: an empty revision (e.g. empty git output) now also maps to Unknown
  return git_revision or "Unknown"


def write_version_py(filename='cusim/version.py'):
  """Write the package version and git revision into cusim/version.py."""
  cnt = """
short_version = '%(version)s'
git_revision = '%(git_revision)s'
"""
  git_revision = git_version()
  with open(filename, 'w') as fout:
    fout.write(cnt % {'version': VERSION,
                      'git_revision': git_revision})


class BuildExtension(BUILDEXT):
  """build_ext that additionally drives CMake-based extensions."""

  def run(self):
    for ext in self.extensions:
      print(ext.name)
      if hasattr(ext, 'extension_type') and ext.extension_type == 'cmake':
        self.cmake()
    super().run()

  def cmake(self):
    """Configure and build the CMake project inside self.build_temp."""
    cwd = pathlib.Path().absolute()

    build_temp = pathlib.Path(self.build_temp)
    build_temp.mkdir(parents=True, exist_ok=True)

    build_type = 'Debug' if self.debug else 'Release'

    cmake_args = [
        '-DCMAKE_BUILD_TYPE=' + build_type,
        '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + CLIB_DIR,
    ]

    build_args = []

    # fix: restore the working directory even when cmake fails; the
    # original left the process chdir'ed into build_temp on error
    os.chdir(str(build_temp))
    try:
      self.spawn(['cmake', str(cwd)] + cmake_args)
      if not self.dry_run:
        self.spawn(['cmake', '--build', '.'] + build_args)
    finally:
      os.chdir(str(cwd))


def setup_package():
  """Entry point: write version.py, then invoke setuptools.setup."""
  write_version_py()
  cmdclass = {
      'build_ext': BuildExtension
  }

  metadata = dict(
      name='cusim',
      maintainer="Jisang Yoon",
      maintainer_email="vjs10101v@gmail.com",
      author="Jisang Yoon",
      author_email="vjs10101v@gmail.com",
      description=DOCLINES[0],
      long_description="\n".join(DOCLINES[2:]),
      url="https://github.com/js1010/cusim",
      download_url="https://github.com/js1010/cusim/releases",
      include_package_data=False,
      license='Apache2',
      packages=['cusim/', "cusim/ioutils/", "cusim/culda/", "cusim/cuw2v/"],
      install_requires=INSTALL_REQUIRES,
      cmdclass=cmdclass,
      classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f],
      platforms=['Linux', 'Mac OSX', 'Unix'],
      ext_modules=extensions,
      entry_points={
          'console_scripts': [
          ]
      },
      python_requires='>=3.6',
  )

  metadata['version'] = VERSION
  setup(**metadata)


if __name__ == '__main__':
  setup_package()