├── .gitmodules
├── .readthedocs.yml
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── cpp
    ├── include
    │   ├── culda
    │   │   ├── cuda_lda_kernels.cuh
    │   │   └── culda.hpp
    │   ├── cuw2v
    │   │   ├── cuda_w2v_base_kernels.cuh
    │   │   ├── cuda_w2v_hs_kernels.cuh
    │   │   ├── cuda_w2v_ns_kernels.cuh
    │   │   └── cuw2v.hpp
    │   └── utils
    │   │   ├── cuda_utils_kernels.cuh
    │   │   ├── ioutils.hpp
    │   │   ├── log.hpp
    │   │   └── types.hpp
    └── src
    │   ├── culda
    │       └── culda.cu
    │   ├── cuw2v
    │       └── cuw2v.cu
    │   └── utils
    │       ├── ioutils.cc
    │       └── log.cc
├── cuda_setup.py
├── cusim
    ├── .gitignore
    ├── __init__.py
    ├── aux.py
    ├── constants.py
    ├── culda
    │   ├── __init__.py
    │   ├── bindings.cc
    │   └── pyculda.py
    ├── cuw2v
    │   ├── __init__.py
    │   ├── bindings.cc
    │   └── pycuw2v.py
    ├── ioutils
    │   ├── __init__.py
    │   ├── bindings.cc
    │   └── pyioutils.py
    └── proto
    │   └── config.proto
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.rst
    ├── install.rst
    ├── lda.rst
    ├── make.bat
    └── w2v.rst
├── examples
    ├── README.md
    ├── cusim.topics.txt
    ├── example_lda.py
    ├── example_w2v.py
    ├── gensim.topics.txt
    └── requirements.txt
├── pyproject.toml
├── requirements.txt
└── setup.py


/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "3rd/json11"]
2 | 	path = 3rd/json11
3 | 	url = https://github.com/dropbox/json11
4 | [submodule "3rd/spdlog"]
5 | 	path = 3rd/spdlog
6 | 	url = https://github.com/gabime/spdlog
7 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | 
3 | sphinx:
4 |   builder: html
5 |   configuration: docs/conf.py
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | # reference: https://github.com/jeremad/cuda-travis/blob/master/.travis.yml
 2 | language: cpp
 3 | 
 4 | sudo: enabled
 5 | 
 6 | compiler:
 7 |   - gcc
 8 | 
 9 | matrix:
10 |   include:
11 |     - name: CUDA 10
12 |       env:
13 |       - CUDA=10.1.105-1
14 |       - CUDA_SHORT=10.1
15 |       - UBUNTU_VERSION=ubuntu1804
16 |       dist: bionic
17 | 
18 | before_install:
19 |   - sudo apt update
20 |   - sudo apt install -y software-properties-common
21 |   - sudo add-apt-repository -y ppa:deadsnakes/ppa
22 |   - sudo apt update
23 |   - sudo apt install -y python3-pip python3.6 g++
24 |   - pip3 install -U pip
25 |   - pip3 install setuptools
26 |   - pip3 install -r requirements.txt
27 |   - INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
28 |   - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}
29 |   - sudo dpkg -i ${INSTALLER}
30 |   - wget https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
31 |   - sudo apt-key add 7fa2af80.pub
32 |   - sudo apt update -qq
33 |   - sudo apt install -y cuda-core-${CUDA_SHORT/./-} cuda-cudart-dev-${CUDA_SHORT/./-} cuda-curand-dev-${CUDA_SHORT/./-} cuda-cufft-dev-${CUDA_SHORT/./-} 
34 |   - sudo apt clean
35 |   - export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}
36 |   - export LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
37 |   - export PATH=${CUDA_HOME}/bin:${PATH}
38 |   - python3.6 -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto
39 | 
40 | script:
41 |   - sudo python3.6 setup.py install
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include cuda_setup.py
 2 | include requirements.txt
 3 | include pyproject.toml
 4 | recursive-include cpp/src/cuw2v/ *.cu
 5 | recursive-include cpp/src/culda/ *.cu
 6 | recursive-include cpp/src/utils/ *.cc
 7 | recursive-include cpp/include/cuw2v/ *.cuh
 8 | recursive-include cpp/include/cuw2v/ *.hpp
 9 | recursive-include cpp/include/culda/ *.cuh
10 | recursive-include cpp/include/culda/ *.hpp
11 | recursive-include cpp/include/utils/ *.cuh
12 | recursive-include cpp/include/utils/ *.hpp
13 | recursive-include 3rd/json11/ *
14 | recursive-include 3rd/spdlog/ *
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # CUSIM
  2 | 
  3 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) [![Build Status](https://travis-ci.org/js1010/cusim.svg?branch=main)](https://travis-ci.org/js1010/cusim) [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/dwyl/learn-travis/issues) [![Documentation Status](https://readthedocs.org/projects/cusim/badge/?version=latest)](https://cusim.readthedocs.io/en/latest/?badge=latest)
  4 | 
  5 | Superfast CUDA implementation of Word2Vec and Latent Dirichlet Allocation (LDA)
  6 | 
  7 | ### Introduction
  8 | 
  9 | This project is to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as [gensim](https://github.com/RaRe-Technologies/gensim)'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the [word2vec](https://arxiv.org/pdf/1301.3781.pdf) model, and the most representative topic model, the [LDA (Latent Dirichlet Allocation)](https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) model.
 10 | 
 11 | ### Requirements
 12 | 
 13 | - Python3.6+
 14 | - gcc / g++ (>= 5.1 for c++14)
 15 | - cuda >= 7.0
 16 | - Tested on Ubuntu 18.04 / GCC 7.5 / CUDA 11.1 / Python 3.6
 17 | 
 18 | ### How to install
 19 | 
 20 | - install from pypi
 21 | 
 22 | ```shell
 23 | pip install cusim
 24 | ```
 25 | 
 26 | 
 27 | - install from source
 28 | 
 29 | ```shell
 30 | # clone repo and submodules
 31 | git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init
 32 | 
 33 | # install requirements
 34 | pip install -r requirements.txt
 35 | 
 36 | # generate proto
 37 | python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto
 38 | 
 39 | # install
 40 | python setup.py install
 41 | ```
 42 | 
 43 | ### How to use
 44 | 
 45 | - `examples/example_w2v.py`, `examples/example_lda.py` and `examples/README.md` will be very helpful to understand the usage.
 46 | - paremeter description can be seen in `cusim/proto/config.proto`
 47 | 
 48 | ### Performance
 49 | 
 50 | - [AWS g4dn 2xlarge instance](https://aws.amazon.com/ec2/instance-types/g4/) is used to the experiment. (One NVIDIA T4 GPU with 8 vcpus, Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz)
 51 | - results can be reproduced by simply running `examples/example_w2v.py` and `examples/example_lda.py`
 52 | - To evaluate w2v model, I used `evaluate_word_pairs` function ([ref link](https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#evaluating)) in gensim, note that better performance on WS-353 test set does not necessarily mean that the model will workbetter in application as desribed on the link. However, it is good to be measured quantitively and fast training time will be at least very objective measure of the performaance.
 53 |   - I trained W2V model on `quora-duplicate-questions` dataset from gensim downloader api on GPU with cusim and compare the performance (both speed and model quality) with gensim.
 54 | - To evaluate LDA model, I found there is no good way to measure the quality of traing results quantitatively. But we can check the model by looking at the top words of each topic. Also, we can compare the training time quantitatively.
 55 | - W2V (skip gram, hierarchical softmax)
 56 | 
 57 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
 58 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
 59 | | training time (sec) |           892.596    |           544.212    |           310.727    |           226.472    |       **16.162**   |
 60 | | pearson             |             0.487832 |             0.487696 |             0.482821 |             0.487136 |       **0.492101** |
 61 | | spearman            |             0.500846 |             0.506214 |             0.501048 |         **0.506718** |            0.479468 |
 62 | 
 63 | - W2V (skip gram, negative sampling)
 64 | 
 65 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
 66 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
 67 | | training time (sec) |           586.545    |           340.489    |           220.804    |           146.23     |       **33.9173**   |
 68 | | pearson             |             0.354448 |             0.353952 |             0.352398 |             0.352925 |        **0.360436** |
 69 | | spearman            |             0.369146 |             0.369365 |         **0.370565** |             0.365822 |        0.355204     |
 70 | 
 71 | - W2V (CBOW, hierarchical softmax)
 72 | 
 73 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
 74 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
 75 | | training time (sec) |           250.135    |           155.121    |           103.57     |            73.8073   |        **6.20787**  |
 76 | | pearson             |             0.309651 |             0.321803 |             0.324854 |             0.314255 |        **0.480298** |
 77 | | spearman            |             0.294047 |             0.308723 |             0.318293 |             0.300591 |        **0.480971** |
 78 | 
 79 | - W2V (CBOW, negative sampling)
 80 | 
 81 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
 82 | |:--------------------|---------------------:|---------------------:|---------------------:|---------------------:|--------------------:|
 83 | | training time (sec) |           176.923    |           100.369    |            69.7829   |            49.9274   |        **9.90391**  |
 84 | | pearson             |             0.18772  |             0.193152 |             0.204509 |             0.187924 |        **0.368202** |
 85 | | spearman            |             0.243975 |             0.24587  |             0.260531 |             0.237441 |        **0.358042** |
 86 | 
 87 | - LDA (`nytimes` dataset from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
 88 |   - I found that setting `workers` variable in gensim LdaMulticore does not work properly (it uses all cores in instance anyway), so I just compared the speed between cusim with single GPU and gensim with 8 vcpus. 
 89 |   - One can compare the quality of modeling by looking at `examples/cusim.topics.txt` and `examples/gensim.topics.txt`.
 90 | 
 91 | | attr                |   gensim (8 vpus) |   cusim (NVIDIA T4)|
 92 | |:--------------------|------------------:|--------:|
 93 | | training time (sec) |           447.376 | **76.6972** |
 94 | 
 95 | ### Future tasks
 96 | 
 97 | - support half precision
 98 | - support multi device (multi device implementation on LDA model will not be that hard, while multi device training on w2v may require some considerations)
 99 | - implement other models such as [FastText](https://arxiv.org/pdf/1607.04606.pdf), [BERT](https://arxiv.org/pdf/1810.04805.pdf), etc
100 | - **contribution is always welcome**
101 | 


--------------------------------------------------------------------------------
/cpp/include/culda/cuda_lda_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "utils/cuda_utils_kernels.cuh"
  8 | 
  9 | 
 10 | namespace cusim {
 11 | 
 12 | // reference: http://web.science.mq.edu.au/~mjohnson/code/digamma.c
 13 | __inline__ __device__
 14 | float Digamma(float x) {
 15 |   float result = 0.0f, xx, xx2, xx4;
 16 |   for ( ; x < 7.0f; ++x)
 17 |     result -= 1.0f / x;
 18 |   x -= 0.5f;
 19 |   xx = 1.0f / x;
 20 |   xx2 = xx * xx;
 21 |   xx4 = xx2 * xx2;
 22 |   result += logf(x) + 1.0f / 24.0f * xx2 
 23 |     - 7.0f / 960.0f * xx4 + 31.0f / 8064.0f * xx4 * xx2 
 24 |     - 127.0f / 30720.0f * xx4 * xx4;
 25 |   return result;
 26 | }
 27 | 
 28 | __global__ void EstepKernel(
 29 |   const int* cols, const int* indptr, 
 30 |   const bool* vali, const float* counts,
 31 |   const bool init_gamma, const int num_cols, const int num_indptr, 
 32 |   const int num_topics, const int num_iters,
 33 |   const float* alpha, const float* beta,
 34 |   float* gamma, float* grad_alpha, float* new_beta, 
 35 |   float* train_losses, float* vali_losses, int* locks) {
 36 |   
 37 |   // storage for block
 38 |   extern __shared__ float shared_memory[];
 39 |   float* _new_gamma = &shared_memory[0];
 40 |   float* _phi = &shared_memory[num_topics];
 41 |   float* _loss_vec = &shared_memory[num_topics * 2];
 42 |   float* _vali_phi_sum = &shared_memory[num_topics * 3];
 43 | 
 44 |   float* _grad_alpha = grad_alpha + num_topics * blockIdx.x;
 45 | 
 46 |   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
 47 |     int beg = indptr[i], end = indptr[i + 1];
 48 |     float* _gamma = gamma + num_topics * i;
 49 |     if (init_gamma) {
 50 |       for (int j = threadIdx.x; j < num_topics; j += blockDim.x) {
 51 |         _gamma[j] = alpha[j] + (end - beg) / num_topics;
 52 |       }
 53 |     }
 54 |     __syncthreads();
 55 |     
 56 |     // initiate phi sum for validation data for computing vali loss 
 57 |     for (int j = threadIdx.x; j < num_topics; j += blockDim.x)
 58 |       _vali_phi_sum[j] = 0.0f;
 59 | 
 60 |     // iterate E step
 61 |     for (int j = 0; j < num_iters; ++j) {
 62 |       // initialize new gamma
 63 |       for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
 64 |         _new_gamma[k] = 0.0f;
 65 |       __syncthreads();
 66 | 
 67 |       // compute phi from gamma
 68 |       for (int k = beg; k < end; ++k) {
 69 |         const int w = cols[k];
 70 |         const bool _vali = vali[k];
 71 |         const float c = counts[k]; 
 72 |         // compute phi
 73 |         if (not _vali or j + 1 == num_iters) {
 74 |           for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
 75 |             _phi[l] = beta[w * num_topics + l] * expf(Digamma(_gamma[l]));
 76 |           __syncthreads();
 77 |           
 78 |           // normalize phi and add it to new gamma and new beta
 79 |           float phi_sum = ReduceSum(_phi, num_topics);
 80 | 
 81 |           for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
 82 |             _phi[l] /= phi_sum;
 83 |             
 84 |             // update gamma for train data and phi_sum for computing loss
 85 |             if (_vali) 
 86 |               _vali_phi_sum[l] += _phi[l] * c;
 87 |             else
 88 |               _new_gamma[l] += _phi[l] * c;
 89 |           
 90 |           }
 91 |           __syncthreads();
 92 |         }
 93 |         
 94 |         if (j + 1 == num_iters) {
 95 |           // update beta for train data
 96 |           if (not _vali) {
 97 |             // write access of w th vector of new_beta 
 98 |             if (threadIdx.x == 0) {
 99 |               while (atomicCAS(&locks[w], 0, 1)) {}
100 |             } 
101 | 
102 |             __syncthreads();
103 |             for (int l = threadIdx.x; l < num_topics; l += blockDim.x)
104 |               new_beta[w * num_topics + l] += _phi[l] * c;
105 |             __syncthreads();
106 | 
107 |             // release lock
108 |             if (threadIdx.x == 0) locks[w] = 0;
109 |             __syncthreads();
110 |           }
111 |           
112 |           // comput loss and reset shared mem
113 |           // see Eq (15) in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
114 |           for (int l = threadIdx.x; l < num_topics; l += blockDim.x) {
115 |             _loss_vec[l] = logf(fmaxf(beta[w * num_topics + l], EPS));
116 |             _loss_vec[l] -= logf(fmaxf(_phi[l], EPS));
117 |             _loss_vec[l] *= _phi[l];
118 |           }
119 |           __syncthreads();
120 |           float _loss = ReduceSum(_loss_vec, num_topics) * c;
121 |           if (threadIdx.x == 0) {
122 |             if (_vali) 
123 |               vali_losses[blockIdx.x] += _loss;
124 |             else
125 |               train_losses[blockIdx.x] += _loss;
126 |           }
127 |           __syncthreads();
128 | 
129 |         }
130 |         __syncthreads();
131 |       }
132 | 
133 |       // update gamma
134 |       for (int k = threadIdx.x; k < num_topics; k += blockDim.x)
135 |         _gamma[k] = _new_gamma[k] + alpha[k];
136 |       __syncthreads();
137 |     }
138 | 
139 |     // update gradient of alpha and loss from E[log(theta)]
140 |     float gamma_sum = ReduceSum(_gamma, num_topics);
141 |     for (int j = threadIdx.x; j < num_topics; j += blockDim.x) {
142 |       float Elogthetad = Digamma(_gamma[j]) - Digamma(gamma_sum);
143 |       _grad_alpha[j] += Elogthetad;
144 |       _new_gamma[j] *= Elogthetad;
145 |       _vali_phi_sum[j] *= Elogthetad;
146 |     }
147 |     
148 |     // see Eq (15) in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
149 |     float train_loss = ReduceSum(_new_gamma, num_topics);
150 |     float vali_loss = ReduceSum(_vali_phi_sum, num_topics);
151 |     if (threadIdx.x == 0) {
152 |       train_losses[blockIdx.x] += train_loss;
153 |       vali_losses[blockIdx.x] += vali_loss;
154 |     }
155 | 
156 |     __syncthreads();
157 |   } 
158 | }
159 | 
160 | }  // cusim
161 | 


--------------------------------------------------------------------------------
/cpp/include/culda/culda.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include <thrust/copy.h>
 8 | #include <thrust/fill.h>
 9 | #include <thrust/random.h>
10 | #include <thrust/host_vector.h>
11 | #include <thrust/device_vector.h>
12 | #include <thrust/binary_search.h>
13 | #include <thrust/execution_policy.h>
14 | 
15 | #include <omp.h>
16 | #include <set>
17 | #include <random>
18 | #include <memory>
19 | #include <string>
20 | #include <fstream>
21 | #include <utility>
22 | #include <queue>
23 | #include <deque>
24 | #include <functional>
25 | #include <vector>
26 | #include <cmath>
27 | #include <chrono> // NOLINT
28 | 
29 | #include "json11.hpp"
30 | #include "utils/log.hpp"
31 | #include "utils/types.hpp"
32 | 
33 | namespace cusim {
34 | 
35 | 
36 | // reference: https://people.math.sc.edu/Burkardt/cpp_src/asa121/asa121.cpp
37 | inline float Trigamma(float x) {
38 |   const float a = 0.0001f;
39 |   const float b = 5.0f;
40 |   const float b2 =  0.1666666667f;
41 |   const float b4 = -0.03333333333f;
42 |   const float b6 =  0.02380952381f;
43 |   const float b8 = -0.03333333333f;
44 |   float value = 0, y = 0, z = x;
45 |   if (x <= a) return 1.0f / x / x;
46 |   while (z < b) {
47 |     value += 1.0f / z / z;
48 |     z++;
49 |   }
50 |   y = 1.0f / z / z;
51 |   value += value + 0.5 * y + (1.0
52 |     + y * (b2
53 |     + y * (b4
54 |     + y * (b6
55 |     + y * b8)))) / z;
56 |   return value;
57 | }
58 | 
59 | 
60 | class CuLDA {
61 |  public:
62 |   CuLDA();
63 |   ~CuLDA();
64 |   bool Init(std::string opt_path);
65 |   void LoadModel(float* alpha, float* beta,
66 |       float* grad_alpha, float* new_beta, const int num_words);
67 |   std::pair<float, float> FeedData(
68 |       const int* indices, const int* indptr,
69 |       const bool* vali, const float* counts,
70 |       float* gamma, const bool init_gamma,
71 |       const int num_indices, const int num_indptr,
72 |       const int num_iters);
73 |   void Pull();
74 |   void Push();
75 |   int GetBlockCnt();
76 | 
77 |  private:
78 |   DeviceInfo dev_info_;
79 |   json11::Json opt_;
80 |   std::shared_ptr<spdlog::logger> logger_;
81 |   std::unique_ptr<CuSimLogger> logger_container_;
82 |   thrust::device_vector<float> dev_alpha_, dev_beta_;
83 |   thrust::device_vector<float> dev_grad_alpha_, dev_new_beta_;
84 |   thrust::device_vector<int> dev_locks_;
85 | 
86 |   float *alpha_, *beta_, *grad_alpha_, *new_beta_;
87 |   int block_cnt_, block_dim_;
88 |   int num_topics_, num_words_;
89 | };
90 | 
91 | } // namespace cusim
92 | 


--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_base_kernels.cuh:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include "utils/cuda_utils_kernels.cuh"
 8 | 
 9 | #define MAX_EXP 20
10 | 
11 | namespace cusim {
12 | 
13 | 
14 | __inline__ __device__
15 | void PositiveFeedback(const float* vec1, float* vec2, float* grad, 
16 |     float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
17 |   static __shared__ float g;
18 |   float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
19 |   if (threadIdx.x == 0) {
20 |     float exp_dot = expf(-dot);
21 |     g = exp_dot / (1 + exp_dot) * lr;
22 |     loss_nume += logf(1 + exp_dot);
23 |     loss_deno++;
24 |   }
25 |   __syncthreads();
26 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
27 |     grad[i] += vec2[i] * g;
28 |     vec2[i] += vec1[i] * g;
29 |   }
30 |   __syncthreads();
31 | }
32 | 
33 | __inline__ __device__
34 | void NegativeFeedback(const float* vec1, float* vec2, float* grad, 
35 |     float& loss_nume, float& loss_deno, const int num_dims, const float lr) {
36 |   static __shared__ float g;
37 |   float dot = fmaxf(-MAX_EXP, fminf(MAX_EXP, Dot(vec1, vec2, num_dims)));
38 |   if (threadIdx.x == 0) {
39 |     float exp_dot = expf(dot);
40 |     g = exp_dot / (1 + exp_dot) * lr;
41 |     loss_nume += logf(1 + exp_dot);
42 |     loss_deno++;
43 |   }
44 |   __syncthreads();
45 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x) {
46 |     grad[i] -= vec2[i] * g;
47 |     vec2[i] -= vec1[i] * g;
48 |   }
49 |   __syncthreads();
50 | }
51 | 
52 | }  // cusim
53 | 


--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_hs_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "utils/cuda_utils_kernels.cuh"
  8 | #include "cuw2v/cuda_w2v_base_kernels.cuh"
  9 | 
 10 | 
 11 | namespace cusim {
 12 | 
 13 | __global__ void W2VHsSgKernel(
 14 |   const int* cols, const int* indptr,
 15 |   const bool* codes, const int* points, const int* hs_indptr,
 16 |   const int num_indptr, const int num_dims, const int window_size,
 17 |   default_random_engine* rngs,
 18 |   float* emb_in, float* emb_out, 
 19 |   float* loss_nume, float* loss_deno, const float lr) {
 20 |   
 21 |   default_random_engine& rng = rngs[blockIdx.x];
 22 |   float& _loss_nume = loss_nume[blockIdx.x];
 23 |   float& _loss_deno = loss_deno[blockIdx.x];
 24 | 
 25 |   uniform_int_distribution<int> dist_window(0, window_size - 1);
 26 |   static __shared__ int reduced_windows;
 27 |   extern __shared__ float shared_memory[];
 28 |   float* grad = &shared_memory[0];
 29 | 
 30 |   // zero-initialize shared mem
 31 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
 32 |     grad[i] = 0.0f;
 33 |   __syncthreads();
 34 | 
 35 |   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
 36 |     int beg = indptr[i], end = indptr[i + 1];
 37 |     for (int j = beg; j < end; ++j) {
 38 |       if (threadIdx.x == 0) reduced_windows = dist_window(rng);
 39 |       __syncthreads();
 40 |       int beg2 = max(beg, j - window_size + reduced_windows);
 41 |       int end2 = min(end, j + window_size - reduced_windows + 1);
 42 |       for (int k = beg2; k < end2; ++k) {
 43 |         if (k == j) continue;
 44 |         float* _emb_in = emb_in + num_dims * cols[k];
 45 |         int beg3 = hs_indptr[cols[j]];
 46 |         int end3 = hs_indptr[cols[j] + 1];
 47 |         for (int l = beg3; l < end3; ++l) {
 48 |           if (codes[l]) {
 49 |             PositiveFeedback(_emb_in, emb_out + num_dims * points[l],
 50 |                 grad, _loss_nume, _loss_deno, num_dims, lr);
 51 |           } else {
 52 |             NegativeFeedback(_emb_in, emb_out + num_dims * points[l],
 53 |                 grad, _loss_nume, _loss_deno, num_dims, lr);
 54 |           }
 55 |           __syncthreads();
 56 |         }
 57 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
 58 |           _emb_in[l] += grad[l];
 59 |           grad[l] = 0.0f;
 60 |         }
 61 |         __syncthreads();
 62 |       }
 63 |     }
 64 |   } 
 65 | }
 66 | 
 67 | __global__ void W2VHsCbowKernel(
 68 |   const int* cols, const int* indptr,
 69 |   const bool* codes, const int* points, const int* hs_indptr,
 70 |   const int num_indptr, const int num_dims, const int window_size, default_random_engine* rngs,
 71 |   float* emb_in, float* emb_out, 
 72 |   float* loss_nume, float* loss_deno, 
 73 |   const bool cbow_mean, const float lr) {
 74 |   
 75 |   default_random_engine& rng = rngs[blockIdx.x];
 76 |   float& _loss_nume = loss_nume[blockIdx.x];
 77 |   float& _loss_deno = loss_deno[blockIdx.x];
 78 | 
 79 |   uniform_int_distribution<int> dist_window(0, window_size - 1);
 80 |   static __shared__ int reduced_windows;
 81 |   extern __shared__ float shared_memory[];
 82 |   float* grad = &shared_memory[0];
 83 |   float* cbow = &shared_memory[num_dims];
 84 | 
 85 |   __syncthreads();
 86 | 
 87 |   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
 88 |     int beg = indptr[i], end = indptr[i + 1];
 89 |     for (int j = beg; j < end; ++j) {
 90 |       if (threadIdx.x == 0) reduced_windows = dist_window(rng);
 91 |       __syncthreads();
 92 |       int beg2 = max(beg, j - window_size + reduced_windows);
 93 |       int end2 = min(end, j + window_size - reduced_windows + 1);
 94 |       if (end2 - beg2 <= 1) continue;
 95 |       
 96 |       // zero-initialize shared mem
 97 |       for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
 98 |         grad[k] = 0.0f;
 99 |         cbow[k] = 0.0f;
100 |       }
101 |       
102 |       // compute cbow
103 |       for (int k = beg2; k < end2; ++k) {
104 |         if (k == j) continue;
105 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
106 |           cbow[l] += emb_in[num_dims * cols[k] + l];
107 |         }
108 |       }
109 |       if (cbow_mean) {
110 |         for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
111 |           cbow[k] /= (end2 - beg2 - 1);
112 |         }
113 |       }
114 |       __syncthreads();
115 |        
116 |       int beg3 = hs_indptr[cols[j]];
117 |       int end3 = hs_indptr[cols[j] + 1];
118 |       for (int k = beg3; k < end3; ++k) {
119 |         if (codes[k]) {
120 |           PositiveFeedback(cbow, emb_out + num_dims * points[k],
121 |               grad, _loss_nume, _loss_deno, num_dims, lr);
122 |         } else {
123 |           NegativeFeedback(cbow, emb_out + num_dims * points[k],
124 |               grad, _loss_nume, _loss_deno, num_dims, lr);
125 |         }
126 |         __syncthreads();
127 |       }
128 |       
129 |       // normalize grad if cbow_mean = true
130 |       if (cbow_mean) {
131 |         for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
132 |           grad[k] /= (end2 - beg2 - 1);
133 |         }
134 |       }
135 |       __syncthreads();
136 | 
137 |       // update emb_in
138 |       for (int k = beg2; k < end2; ++k) {
139 |         if (k == j) continue;
140 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
141 |           emb_in[num_dims * cols[k] + l] += grad[l];
142 |         } 
143 |         __syncthreads();
144 |       }
145 |     }
146 |   } 
147 | }
148 | 
149 | }  // cusim
150 | 


--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuda_w2v_ns_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include "utils/cuda_utils_kernels.cuh"
  8 | #include "cuw2v/cuda_w2v_base_kernels.cuh"
  9 | 
 10 | 
 11 | namespace cusim {
 12 | 
 13 | __global__ void W2VNegSgKernel(
 14 |   const int* cols, const int* indptr,
 15 |   const int* random_table, default_random_engine* rngs, const int random_size,
 16 |   const int num_indptr, const int num_dims, const int neg, const int window_size,
 17 |   float* emb_in, float* emb_out, float* loss_nume, float* loss_deno, const float lr) {
 18 |   
 19 |   default_random_engine& rng = rngs[blockIdx.x];
 20 |   float& _loss_nume = loss_nume[blockIdx.x];
 21 |   float& _loss_deno = loss_deno[blockIdx.x];
 22 | 
 23 |   uniform_int_distribution<int> dist_neg(0, random_size - 1);
 24 |   uniform_int_distribution<int> dist_window(0, window_size - 1);
 25 |   __shared__ int reduced_windows;
 26 |   __shared__ int neg_word;
 27 |   extern __shared__ float shared_memory[];
 28 |   float* grad = &shared_memory[0];
 29 | 
 30 |   // zero-initialize shared mem
 31 |   for (int i = threadIdx.x; i < num_dims; i += blockDim.x)
 32 |     grad[i] = 0.0f;
 33 |   __syncthreads();
 34 | 
 35 |   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
 36 |     int beg = indptr[i], end = indptr[i + 1];
 37 |     for (int j = beg; j < end; ++j) {
 38 |       if (threadIdx.x == 0) reduced_windows = dist_window(rng);
 39 |       __syncthreads();
 40 |       int beg2 = max(beg, j - window_size + reduced_windows);
 41 |       int end2 = min(end, j + window_size - reduced_windows + 1);
 42 |       for (int k = beg2; k < end2; ++k) {
 43 |         if (k == j) continue;
 44 |         float* _emb_in = emb_in + num_dims * cols[k];
 45 |         PositiveFeedback(_emb_in, emb_out + num_dims * cols[j], 
 46 |             grad, _loss_nume, _loss_deno, num_dims, lr);
 47 |         for (int l = 0; l < neg; ++l) {
 48 |           if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
 49 |           __syncthreads();
 50 |           if (neg_word == cols[j]) continue;
 51 |           NegativeFeedback(_emb_in, emb_out + num_dims * neg_word, 
 52 |               grad, _loss_nume, _loss_deno, num_dims, lr);
 53 |         }
 54 |         __syncthreads();
 55 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
 56 |           _emb_in[l] += grad[l];
 57 |           grad[l] = 0.0f;
 58 |         }
 59 |         __syncthreads();
 60 |       }
 61 |     }
 62 |   } 
 63 | }
 64 | 
 65 | __global__ void W2VNegCbowKernel(
 66 |   const int* cols, const int* indptr,
 67 |   const int* random_table, default_random_engine* rngs, const int random_size,
 68 |   const int num_indptr, const int num_dims, const int neg, const int window_size, 
 69 |   float* emb_in, float* emb_out, 
 70 |   float* loss_nume, float* loss_deno, const bool cbow_mean, const float lr) {
 71 |   
 72 |   default_random_engine& rng = rngs[blockIdx.x];
 73 |   float& _loss_nume = loss_nume[blockIdx.x];
 74 |   float& _loss_deno = loss_deno[blockIdx.x];
 75 | 
 76 |   uniform_int_distribution<int> dist_neg(0, random_size - 1);
 77 |   uniform_int_distribution<int> dist_window(0, window_size - 1);
 78 |   static __shared__ int reduced_windows;
 79 |   static __shared__ int neg_word;
 80 |   extern __shared__ float shared_memory[];
 81 |   float* grad = &shared_memory[0];
 82 |   float* cbow = &shared_memory[num_dims];
 83 | 
 84 |   __syncthreads();
 85 | 
 86 |   for (int i = blockIdx.x; i < num_indptr; i += gridDim.x) {
 87 |     int beg = indptr[i], end = indptr[i + 1];
 88 |     for (int j = beg; j < end; ++j) {
 89 |       if (threadIdx.x == 0) reduced_windows = dist_window(rng);
 90 |       __syncthreads();
 91 |       int beg2 = max(beg, j - window_size + reduced_windows);
 92 |       int end2 = min(end, j + window_size - reduced_windows + 1);
 93 |       if (end2 - beg2 <= 1) continue;
 94 |       
 95 |       // zero-initialize shared mem
 96 |       for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
 97 |         grad[k] = 0.0f;
 98 |         cbow[k] = 0.0f;
 99 |       }
100 | 
101 |       // compute cbow
102 |       for (int k = beg2; k < end2; ++k) {
103 |         if (k == j) continue;
104 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x) {
105 |           cbow[l] += emb_in[num_dims * cols[k] + l];
106 |         }
107 |       }
108 |       if (cbow_mean) {
109 |         for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
110 |           cbow[k] /= (end2 - beg2 - 1);
111 |         }
112 |       }
113 |       __syncthreads();
114 |       
115 |       PositiveFeedback(cbow, emb_out + num_dims * cols[j], grad,
116 |           _loss_nume, _loss_deno, num_dims, lr);
117 |       __syncthreads();
118 |       
119 |       // update negative feedback
120 |       for (int k = 0; k < neg; ++k){
121 |         if (threadIdx.x == 0) neg_word = random_table[dist_neg(rng)];
122 |         __syncthreads();
123 |         if (neg_word == cols[j]) continue;
124 |         NegativeFeedback(cbow, emb_out + num_dims * neg_word, 
125 |             grad, _loss_nume, _loss_deno, num_dims, lr);
126 |       }
127 |       __syncthreads();
128 |       
129 |       // normalize grad if cbow_mean = true
130 |       if (cbow_mean) {
131 |         for (int k = threadIdx.x; k < num_dims; k += blockDim.x) {
132 |           grad[k] /= (end2 - beg2 - 1);
133 |         }
134 |       }
135 |       __syncthreads();
136 | 
137 |       // update emb_in
138 |       for (int k = beg2; k < end2; ++k) {
139 |         if (k == j) continue; 
140 |         for (int l = threadIdx.x; l < num_dims; l += blockDim.x)
141 |           emb_in[num_dims * cols[k] + l] += grad[l];
142 |       }
143 |       __syncthreads();
144 | 
145 |     }
146 |   } 
147 | }
148 | 
149 | }  // cusim
150 | 


--------------------------------------------------------------------------------
/cpp/include/cuw2v/cuw2v.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | #include <thrust/copy.h>
 8 | #include <thrust/fill.h>
 9 | #include <thrust/random.h>
10 | #include <thrust/host_vector.h>
11 | #include <thrust/device_vector.h>
12 | #include <thrust/binary_search.h>
13 | #include <thrust/execution_policy.h>
14 | 
15 | #include <omp.h>
16 | #include <set>
17 | #include <random>
18 | #include <memory>
19 | #include <string>
20 | #include <fstream>
21 | #include <utility>
22 | #include <queue>
23 | #include <deque>
24 | #include <functional>
25 | #include <vector>
26 | #include <cmath>
27 | #include <chrono> // NOLINT
28 | 
29 | #include "json11.hpp"
30 | #include "utils/log.hpp"
31 | #include "utils/types.hpp"
32 | 
33 | using thrust::random::default_random_engine;
34 | 
35 | namespace cusim {
36 | 
37 | class CuW2V {
38 |  public:
39 |   CuW2V();
40 |   ~CuW2V();
41 |   bool Init(std::string opt_path);
42 |   void LoadModel(float* emb_in, float* emb_out);
43 |   void BuildHuffmanTree(const float* word_count, const int num_words);
44 |   void BuildRandomTable(const double* word_count, const int num_words, const int table_size);
45 |   std::pair<float, float> FeedData(const int* cols, const int* indptr,
46 |       const int num_cols, const int num_indptr);
47 |   void Pull();
48 | 
49 |  private:
50 |   DeviceInfo dev_info_;
51 |   json11::Json opt_;
52 |   std::shared_ptr<spdlog::logger> logger_;
53 |   std::unique_ptr<CuSimLogger> logger_container_;
54 |   int block_cnt_, block_dim_;
55 |   int num_dims_, num_words_, window_size_;
56 |   float *emb_in_, *emb_out_, lr_;
57 |   thrust::device_vector<float> dev_emb_in_, dev_emb_out_;
58 | 
59 |   // variables to construct huffman tree
60 |   int max_depth_;
61 |   thrust::device_vector<bool> dev_codes_;
62 |   thrust::device_vector<int> dev_points_, dev_hs_indptr_;
63 | 
64 |   // related to negative sampling / hierarchical softmax and skip gram / cbow
65 |   bool sg_, cbow_mean_;
66 |   int neg_;
67 | 
68 |   // variables to construct random table
69 |   thrust::device_vector<int> dev_random_table_;
70 |   int random_size_, seed_;
71 |   thrust::device_vector<default_random_engine> dev_rngs_;
72 | };
73 | 
74 | } // namespace cusim
75 | 


--------------------------------------------------------------------------------
/cpp/include/utils/cuda_utils_kernels.cuh:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #pragma once
  7 | #include <unistd.h>
  8 | // #include <cublas_v2.h>
  9 | #include <cuda_runtime.h>
 10 | 
 11 | #include <thrust/copy.h>
 12 | #include <thrust/fill.h>
 13 | #include <thrust/random.h>
 14 | #include <thrust/host_vector.h>
 15 | #include <thrust/device_vector.h>
 16 | #include <thrust/binary_search.h>
 17 | #include <thrust/execution_policy.h>
 18 | #include <cooperative_groups.h>
 19 | 
 20 | #include <stdexcept>
 21 | #include <sstream>
 22 | #include <ctime>
 23 | #include <utility>
 24 | #include "utils/types.hpp"
 25 | 
 26 | using thrust::random::default_random_engine;
 27 | using thrust::random::uniform_int_distribution;
 28 | 
 29 | namespace cusim {
 30 | 
 31 | // Error Checking utilities, checks status codes from cuda calls
 32 | // and throws exceptions on failure (which cython can proxy back to python)
 33 | #define CHECK_CUDA(code) { checkCuda((code), __FILE__, __LINE__); }
 34 | inline void checkCuda(cudaError_t code, const char *file, int line) {
 35 |   if (code != cudaSuccess) {
 36 |     std::stringstream err;
 37 |     err << "Cuda Error: " << cudaGetErrorString(code) << " (" << file << ":" << line << ")";
 38 |     throw std::runtime_error(err.str());
 39 |   }
 40 | }
 41 | 
 42 | // inline const char* cublasGetErrorString(cublasStatus_t status) {
 43 | //   switch (status) {
 44 | //     case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
 45 | //     case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
 46 | //     case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
 47 | //     case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
 48 | //     case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
 49 | //     case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
 50 | //     case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
 51 | //     case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
 52 | //   }
 53 | //   return "Unknown";
 54 | // }
 55 | // 
 56 | // #define CHECK_CUBLAS(code) { checkCublas((code), __FILE__, __LINE__); }
 57 | // inline void checkCublas(cublasStatus_t code, const char * file, int line) {
 58 | //   if (code != CUBLAS_STATUS_SUCCESS) {
 59 | //     std::stringstream err;
 60 | //     err << "cublas error: " << cublasGetErrorString(code)
 61 | //         << " (" << file << ":" << line << ")";
 62 | //     throw std::runtime_error(err.str());
 63 | //   }
 64 | // }
 65 | 
 66 | inline DeviceInfo GetDeviceInfo() {
 67 |   DeviceInfo ret;
 68 |   CHECK_CUDA(cudaGetDevice(&ret.devId));
 69 |   cudaDeviceProp prop;
 70 |   CHECK_CUDA(cudaGetDeviceProperties(&prop, ret.devId));
 71 |   ret.mp_cnt = prop.multiProcessorCount;
 72 |   ret.major = prop.major;
 73 |   ret.minor = prop.minor;
 74 |   // reference: https://stackoverflow.com/a/32531982
 75 |   switch (ret.major) {
 76 |     case 2: // Fermi
 77 |       if (ret.minor == 1)
 78 |         ret.cores = ret.mp_cnt * 48;
 79 |       else
 80 |         ret.cores = ret.mp_cnt * 32;
 81 |       break;
 82 |     case 3: // Kepler
 83 |       ret.cores = ret.mp_cnt * 192;
 84 |       break;
 85 |     case 5: // Maxwell
 86 |       ret.cores = ret.mp_cnt * 128;
 87 |       break;
 88 |     case 6: // Pascal
 89 |       if (ret.minor == 1 or ret.minor == 2)
 90 |         ret.cores = ret.mp_cnt * 128;
 91 |       else if (ret.minor == 0)
 92 |         ret.cores = ret.mp_cnt * 64;
 93 |       else
 94 |         ret.unknown = true;
 95 |       break;
 96 |     case 7: // Volta and Turing
 97 |       if (ret.minor == 0 or ret.minor == 5)
 98 |         ret.cores = ret.mp_cnt * 64;
 99 |       else
100 |         ret.unknown = true;
101 |       break;
102 |     case 8: // Ampere
103 |       if (ret.minor == 0)
104 |         ret.cores = ret.mp_cnt * 64;
105 |       else if (ret.minor == 6)
106 |         ret.cores = ret.mp_cnt * 128;
107 |       else
108 |         ret.unknown = true;
109 |       break;
110 |     default:
111 |         ret.unknown = true;
112 |       break;
113 |   }
114 |   if (ret.cores == -1) ret.cores = ret.mp_cnt * 128;
115 |   return ret;
116 | }
117 | 
118 | __inline__ __device__
119 | float warp_reduce_sum(float val) {
120 |   #if __CUDACC_VER_MAJOR__ >= 9
121 |   // __shfl_down is deprecated with cuda 9+. use newer variants
122 |   unsigned int active = __activemask();
123 |   #pragma unroll
124 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
125 |       val += __shfl_down_sync(active, val, offset);
126 |   }
127 |   #else
128 |   #pragma unroll
129 |   for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
130 |       val += __shfl_down(val, offset);
131 |   }
132 |   #endif
133 |   return val;
134 | }
135 | 
136 | __inline__ __device__
137 | float Dot(const float* vec1, const float* vec2, const int length) {
138 |   
139 |   static __shared__ float shared[32];
140 | 
141 |   // figure out the warp/ position inside the warp
142 |   int warp =  threadIdx.x / WARP_SIZE;
143 |   int lane = threadIdx.x % WARP_SIZE;
144 |   
145 |   // paritial sum
146 |   float val = 0.0f;
147 |   for (int i = threadIdx.x; i < length; i += blockDim.x) 
148 |     val += vec1[i] * vec2[i];
149 |   val = warp_reduce_sum(val);
150 |   
151 |   // write out the partial reduction to shared memory if appropiate
152 |   if (lane == 0) {
153 |     shared[warp] = val;
154 |   }
155 |   __syncthreads();
156 |   
157 |   // if we we don't have multiple warps, we're done
158 |   if (blockDim.x <= WARP_SIZE) {
159 |     return shared[0];
160 |   }
161 | 
162 |   // otherwise reduce again in the first warp
163 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f;
164 |   if (warp == 0) {
165 |     val = warp_reduce_sum(val);
166 |     // broadcast back to shared memory
167 |     if (threadIdx.x == 0) {
168 |         shared[0] = val;
169 |     }
170 |   }
171 |   __syncthreads();
172 |   return shared[0];
173 | }
174 | 
175 | __inline__ __device__
176 | float ReduceSum(const float* vec, const int length) {
177 |   
178 |   static __shared__ float shared[32];
179 | 
180 |   // figure out the warp/ position inside the warp
181 |   int warp =  threadIdx.x / WARP_SIZE;
182 |   int lane = threadIdx.x % WARP_SIZE;
183 |   
184 |   // paritial sum
185 |   float val = 0.0f;
186 |   for (int i = threadIdx.x; i < length; i += blockDim.x) 
187 |     val += vec[i];
188 |   val = warp_reduce_sum(val);
189 |   
190 |   // write out the partial reduction to shared memory if appropiate
191 |   if (lane == 0) {
192 |     shared[warp] = val;
193 |   }
194 |   __syncthreads();
195 |   
196 |   // if we we don't have multiple warps, we're done
197 |   if (blockDim.x <= WARP_SIZE) {
198 |     return shared[0];
199 |   }
200 | 
201 |   // otherwise reduce again in the first warp
202 |   val = (threadIdx.x < blockDim.x / WARP_SIZE) ? shared[lane]: 0.0f;
203 |   if (warp == 0) {
204 |     val = warp_reduce_sum(val);
205 |     // broadcast back to shared memory
206 |     if (threadIdx.x == 0) {
207 |         shared[0] = val;
208 |     }
209 |   }
210 |   __syncthreads();
211 |   return shared[0];
212 | }
213 | 
214 | __global__ void InitRngsKernel(default_random_engine* rngs, int rand_seed) {
215 |   rngs[blockIdx.x].seed(blockIdx.x + rand_seed);
216 | }
217 | 
218 | } // namespace cusim
219 | 


--------------------------------------------------------------------------------
/cpp/include/utils/ioutils.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | 
 8 | #include <set>
 9 | #include <tuple>
10 | #include <random>
11 | #include <memory>
12 | #include <string>
13 | #include <fstream>
14 | #include <utility>
15 | #include <queue>
16 | #include <deque>
17 | #include <sstream>
18 | #include <functional>
19 | #include <vector>
20 | #include <cmath>
21 | #include <chrono> // NOLINT
22 | #include <iostream>
23 | #include <unordered_map>
24 | 
25 | #include "json11.hpp"
26 | #include "utils/log.hpp"
27 | 
28 | namespace cusim {
29 | 
30 | class IoUtils {
31 |  public:
32 |   IoUtils();
33 |   ~IoUtils();
34 |   bool Init(std::string opt_path);
35 |   int64_t LoadStreamFile(std::string filepath);
36 |   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads);
37 |   std::pair<int, int> TokenizeStream(int num_lines, int num_threads);
38 |   void GetWordVocab(int min_count, std::string keys_path, std::string count_path);
39 |   void GetToken(int* rows, int* cols, int* indptr);
40 |   std::tuple<int64_t, int, int64_t> ReadBagOfWordsHeader(std::string filepath);
41 |   void ReadBagOfWordsContent(int64_t* rows, int* cols, float* counts, const int num_lines);
42 | 
43 |  private:
44 |   void ParseLine(std::string line, std::vector<std::string>& line_vec);
45 |   void ParseLineImpl(std::string line, std::vector<std::string>& line_vec);
46 | 
47 |   std::vector<std::vector<int>> cols_;
48 |   std::vector<int> indptr_;
49 |   std::mutex global_lock_;
50 |   std::ifstream fin_;
51 |   json11::Json opt_;
52 |   std::shared_ptr<spdlog::logger> logger_;
53 |   std::unique_ptr<CuSimLogger> logger_container_;
54 |   std::unordered_map<std::string, int> word_idmap_, word_count_;
55 |   std::vector<std::string> word_list_;
56 |   int64_t num_lines_, remain_lines_;
57 |   bool lower_;
58 | };  // class IoUtils
59 | 
60 | } // namespace cusim
61 | 


--------------------------------------------------------------------------------
/cpp/include/utils/log.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | 
 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
 8 | #pragma once
 9 | #include <memory>
10 | #include <string>
11 | #define SPDLOG_EOL ""
12 | #define SPDLOG_TRACE_ON
13 | #include "spdlog/spdlog.h"
14 | #include "spdlog/sinks/stdout_color_sinks.h"
15 | 
16 | #define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
17 | 
18 | #define INFO(x, ...) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
19 | #define DEBUG(x, ...) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
20 | #define WARN(x, ...) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
21 | #define TRACE(x, ...) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
22 | #define CRITICAL(x, ...) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__, __VA_ARGS__);
23 | 
24 | #define INFO0(x) logger_->info("[{}:{}] " x "\n", __FILENAME__, __LINE__);
25 | #define DEBUG0(x) logger_->debug("[{}:{}] " x "\n", __FILENAME__, __LINE__);
26 | #define WARN0(x) logger_->warn("[{}:{}] " x "\n", __FILENAME__, __LINE__);
27 | #define TRACE0(x) logger_->trace("[{}:{}] " x "\n", __FILENAME__, __LINE__);
28 | #define CRITICAL0(x) logger_->critical("[{}:{}] " x "\n", __FILENAME__, __LINE__);
29 | 
30 | namespace cusim {
31 | 
32 | class CuSimLogger {
33 |  public:
34 |   CuSimLogger();
35 |   explicit CuSimLogger(std::string name);
36 |   std::shared_ptr<spdlog::logger>& get_logger();
37 |   void set_log_level(int level);
38 |   int get_log_level();
39 | 
40 |  private:
41 |   static int global_logging_level_;
42 |   std::shared_ptr<spdlog::logger> logger_;
43 | };  // class CuSimLogger
44 | 
45 | }  // namespace cusim
46 | 


--------------------------------------------------------------------------------
/cpp/include/utils/types.hpp:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2021 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | #pragma once
 7 | 
 8 | struct DeviceInfo {
 9 |   int devId, mp_cnt, major, minor, cores;
10 |   bool unknown = false;
11 | };
12 | 
13 | #define WARP_SIZE 32
14 | #define EPS 1e-10f
15 | 


--------------------------------------------------------------------------------
/cpp/src/culda/culda.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #include "culda/culda.hpp"
  7 | #include "culda/cuda_lda_kernels.cuh"
  8 | 
  9 | namespace cusim {
 10 | 
 11 | CuLDA::CuLDA() {
 12 |   logger_container_.reset(new CuSimLogger("lda"));
 13 |   logger_ = logger_container_->get_logger();
 14 |   dev_info_ = GetDeviceInfo();
 15 |   if (dev_info_.unknown) DEBUG0("Unknown device type");
 16 |   INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
 17 |        dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores);
 18 | }
 19 | 
 20 | CuLDA::~CuLDA() {}
 21 | 
 22 | bool CuLDA::Init(std::string opt_path) {
 23 |   std::ifstream in(opt_path.c_str());
 24 |   if (not in.is_open()) return false;
 25 | 
 26 |   std::string str((std::istreambuf_iterator<char>(in)),
 27 |       std::istreambuf_iterator<char>());
 28 |   std::string err_cmt;
 29 |   auto _opt = json11::Json::parse(str, err_cmt);
 30 |   if (not err_cmt.empty()) return false;
 31 |   opt_ = _opt;
 32 |   logger_container_->set_log_level(opt_["c_log_level"].int_value());
 33 |   num_topics_ = opt_["num_topics"].int_value();
 34 |   block_dim_ = opt_["block_dim"].int_value();
 35 |   block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_);
 36 |   INFO("num_topics: {}, block_dim: {}, block_cnt: {}", num_topics_, block_dim_, block_cnt_);
 37 |   return true;
 38 | }
 39 | 
 40 | void CuLDA::LoadModel(float* alpha, float* beta, 
 41 |     float* grad_alpha, float* new_beta, int num_words) {
 42 |   num_words_ = num_words;
 43 |   DEBUG("copy model({} x {})", num_words_, num_topics_);
 44 |   dev_alpha_.resize(num_topics_);
 45 |   dev_beta_.resize(num_topics_ * num_words_);
 46 |   thrust::copy(alpha, alpha + num_topics_, dev_alpha_.begin());
 47 |   thrust::copy(beta, beta + num_topics_ * num_words_, dev_beta_.begin());
 48 |   alpha_ = alpha; beta_ = beta;
 49 |   
 50 |   // resize device vector
 51 |   grad_alpha_ = grad_alpha;
 52 |   new_beta_ = new_beta;
 53 |   dev_grad_alpha_.resize(num_topics_ * block_cnt_);
 54 |   dev_new_beta_.resize(num_topics_ * num_words_);
 55 |   // copy to device
 56 |   thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
 57 |   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
 58 |   // set locks
 59 |   dev_locks_.resize(num_words_);
 60 |   std::vector<int> host_locks(num_words_, 0);
 61 |   thrust::copy(host_locks.begin(), host_locks.end(), dev_locks_.begin());
 62 |   
 63 |   CHECK_CUDA(cudaDeviceSynchronize());
 64 | }
 65 | 
 66 | std::pair<float, float> CuLDA::FeedData(
 67 |     const int* cols, const int* indptr, 
 68 |     const bool* vali, const float* counts, float* gamma,
 69 |     const bool init_gamma, const int num_cols, const int num_indptr, 
 70 |     const int num_iters) {
 71 |   
 72 |   // copy feed data to GPU memory
 73 |   thrust::device_vector<int> dev_cols(num_cols);
 74 |   thrust::device_vector<int> dev_indptr(num_indptr + 1);
 75 |   thrust::device_vector<bool> dev_vali(num_cols);
 76 |   thrust::device_vector<float> dev_counts(num_cols);
 77 |   thrust::device_vector<float> dev_gamma(num_indptr * num_topics_);
 78 |   thrust::device_vector<float> dev_train_losses(block_cnt_, 0.0f);
 79 |   thrust::device_vector<float> dev_vali_losses(block_cnt_, 0.0f);
 80 |   thrust::copy(cols, cols + num_cols, dev_cols.begin());
 81 |   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
 82 |   thrust::copy(vali, vali + num_cols, dev_vali.begin());
 83 |   thrust::copy(counts, counts + num_cols, dev_counts.begin());
 84 |   thrust::copy(gamma, gamma + num_indptr * num_topics_, dev_gamma.begin());
 85 |   CHECK_CUDA(cudaDeviceSynchronize());
 86 |   DEBUG0("copy feed data to GPU memory");
 87 | 
 88 |   // run E step in GPU
 89 |   EstepKernel<<<block_cnt_, block_dim_, 
 90 |     4 * num_topics_ * sizeof(float)>>>(
 91 |     thrust::raw_pointer_cast(dev_cols.data()),
 92 |     thrust::raw_pointer_cast(dev_indptr.data()),
 93 |     thrust::raw_pointer_cast(dev_vali.data()),
 94 |     thrust::raw_pointer_cast(dev_counts.data()),
 95 |     init_gamma, num_cols, num_indptr, num_topics_, num_iters,
 96 |     thrust::raw_pointer_cast(dev_alpha_.data()),
 97 |     thrust::raw_pointer_cast(dev_beta_.data()),
 98 |     thrust::raw_pointer_cast(dev_gamma.data()),
 99 |     thrust::raw_pointer_cast(dev_grad_alpha_.data()),
100 |     thrust::raw_pointer_cast(dev_new_beta_.data()),
101 |     thrust::raw_pointer_cast(dev_train_losses.data()),
102 |     thrust::raw_pointer_cast(dev_vali_losses.data()),
103 |     thrust::raw_pointer_cast(dev_locks_.data()));
104 |   CHECK_CUDA(cudaDeviceSynchronize());
105 |   DEBUG0("run E step in GPU");
106 | 
107 |   // pull loss
108 |   std::vector<float> train_losses(block_cnt_), vali_losses(block_cnt_);
109 |   thrust::copy(dev_train_losses.begin(), dev_train_losses.end(), train_losses.begin());
110 |   thrust::copy(dev_vali_losses.begin(), dev_vali_losses.end(), vali_losses.begin());
111 |   thrust::copy(dev_gamma.begin(), dev_gamma.end(), gamma);
112 |   CHECK_CUDA(cudaDeviceSynchronize());
113 |   DEBUG0("pull loss values");
114 | 
115 |   // accumulate
116 |   float train_loss = std::accumulate(train_losses.begin(), train_losses.end(), 0.0f);
117 |   float vali_loss = std::accumulate(vali_losses.begin(), vali_losses.end(), 0.0f);
118 |   return {train_loss, vali_loss};
119 | }
120 | 
121 | void CuLDA::Pull() {
122 |   thrust::copy(dev_grad_alpha_.begin(), dev_grad_alpha_.end(), grad_alpha_);
123 |   thrust::copy(dev_new_beta_.begin(), dev_new_beta_.end(), new_beta_);
124 |   CHECK_CUDA(cudaDeviceSynchronize());
125 | }
126 | 
127 | void CuLDA::Push() {
128 |   thrust::copy(alpha_, alpha_ + num_topics_, dev_alpha_.begin());
129 |   thrust::copy(grad_alpha_, grad_alpha_ + block_cnt_ * num_topics_, dev_grad_alpha_.begin());
130 |   thrust::copy(beta_, beta_ + num_words_ * num_topics_, dev_beta_.begin());
131 |   thrust::copy(new_beta_, new_beta_ + num_words_ * num_topics_, dev_new_beta_.begin());
132 |   CHECK_CUDA(cudaDeviceSynchronize());
133 | }
134 | 
135 | int CuLDA::GetBlockCnt() {
136 |   return block_cnt_;
137 | }
138 | 
139 | }  // namespace cusim
140 | 


--------------------------------------------------------------------------------
/cpp/src/cuw2v/cuw2v.cu:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #include "cuw2v/cuw2v.hpp"
  7 | #include "cuw2v/cuda_w2v_base_kernels.cuh"
  8 | #include "cuw2v/cuda_w2v_ns_kernels.cuh"
  9 | #include "cuw2v/cuda_w2v_hs_kernels.cuh"
 10 | 
 11 | namespace cusim {
 12 | 
 13 | struct HuffmanTreeNode {
 14 |   float count;
 15 |   int index, left, right;
 16 |   HuffmanTreeNode(float count0, int index0, int left0, int right0) {
 17 |     count = count0; index = index0; left = left0; right = right0;
 18 |   }
 19 | };
 20 | 
 21 | std::vector<HuffmanTreeNode> huffman_nodes;
 22 | bool CompareIndex(int lhs, int rhs) {
 23 |   return huffman_nodes[lhs].count > huffman_nodes[rhs].count;
 24 | }
 25 | 
 26 | CuW2V::CuW2V() {
 27 |   logger_container_.reset(new CuSimLogger("w2v"));
 28 |   logger_ = logger_container_->get_logger();
 29 |   dev_info_ = GetDeviceInfo();
 30 |   if (dev_info_.unknown) DEBUG0("Unknown device type");
 31 |   INFO("cuda device info, major: {}, minor: {}, multi processors: {}, cores: {}",
 32 |        dev_info_.major, dev_info_.minor, dev_info_.mp_cnt, dev_info_.cores);
 33 | }
 34 | 
 35 | CuW2V::~CuW2V() {}
 36 | 
 37 | bool CuW2V::Init(std::string opt_path) {
 38 |   std::ifstream in(opt_path.c_str());
 39 |   if (not in.is_open()) return false;
 40 | 
 41 |   std::string str((std::istreambuf_iterator<char>(in)),
 42 |       std::istreambuf_iterator<char>());
 43 |   std::string err_cmt;
 44 |   auto _opt = json11::Json::parse(str, err_cmt);
 45 |   if (not err_cmt.empty()) return false;
 46 |   opt_ = _opt;
 47 |   logger_container_->set_log_level(opt_["c_log_level"].int_value());
 48 |   num_dims_ = opt_["num_dims"].int_value();
 49 |   block_dim_ = opt_["block_dim"].int_value();
 50 |   block_cnt_ = opt_["hyper_threads"].number_value() * (dev_info_.cores / block_dim_);
 51 |   sg_ = opt_["skip_gram"].bool_value();
 52 |   cbow_mean_ = opt_["cbow_mean"].bool_value();
 53 |   window_size_ = opt_["window_size"].int_value();
 54 |   lr_ = opt_["lr"].number_value();
 55 | 
 56 |   // if zero, we will use hierarchical softmax
 57 |   neg_ = opt_["neg"].int_value(); 
 58 |   
 59 |   // random seed
 60 |   seed_ = opt_["seed"].int_value();
 61 |   dev_rngs_.resize(block_cnt_);
 62 |   InitRngsKernel<<<block_cnt_, 1>>>(
 63 |     thrust::raw_pointer_cast(dev_rngs_.data()), seed_);
 64 | 
 65 |   INFO("num_dims: {}, block_dim: {}, block_cnt: {}, objective type: {}, neg: {}", 
 66 |       num_dims_, block_dim_, block_cnt_, sg_? "skip gram": "cbow", neg_);
 67 |   return true;
 68 | }
 69 | 
 70 | void CuW2V::BuildRandomTable(const double* word_count, const int num_words, const int table_size) {
 71 |   num_words_ = num_words;
 72 |   std::vector<int> host_random_table;
 73 |   for (int i = 0; i < num_words; ++i) {
 74 |     int weight = std::max(1, static_cast<int>(word_count[i] * static_cast<double>(table_size)));
 75 |     for (int j = 0; j < weight; ++j)
 76 |       host_random_table.push_back(i);
 77 |   }
 78 |   
 79 |   random_size_ = host_random_table.size();
 80 |   dev_random_table_.resize(random_size_);
 81 |   thrust::copy(host_random_table.begin(), host_random_table.end(), dev_random_table_.begin());
 82 |   CHECK_CUDA(cudaDeviceSynchronize());
 83 |   
 84 |   INFO("random table initialzied, size: {} => {}", table_size, random_size_);
 85 | }
 86 | 
 87 | void CuW2V::BuildHuffmanTree(const float* word_count, const int num_words) {
 88 |   num_words_ = num_words;
 89 | 
 90 |   huffman_nodes.clear();
 91 |   std::priority_queue<int, std::vector<int>, decltype(&CompareIndex)> pq(CompareIndex);
 92 |   for (int i = 0; i < num_words; ++i) {
 93 |     huffman_nodes.emplace_back(word_count[i], i, -1, -1);
 94 |     pq.push(i);
 95 |   }
 96 |   for (int i = 0; i < num_words - 1; ++i) {
 97 |     auto& min1 = huffman_nodes[pq.top()]; pq.pop();
 98 |     auto& min2 = huffman_nodes[pq.top()]; pq.pop();
 99 |     huffman_nodes.emplace_back(min1.count + min2.count, i + num_words, min1.index, min2.index);
100 |     pq.push(i + num_words);
101 |   }
102 | 
103 |   std::vector<std::tuple<int, std::vector<bool>, std::vector<int>>> stack = {{pq.top(), {}, {}}};
104 |   int nodeid;
105 |   std::vector<bool> code;
106 |   std::vector<int> point;
107 |   std::vector<std::vector<bool>> codes(num_words);
108 |   std::vector<std::vector<int>> points(num_words);
109 |   max_depth_ = 0;
110 |   while (not stack.empty()) {
111 |     std::tie(nodeid, code, point) = stack.back();
112 |     stack.pop_back();
113 |     if (nodeid < num_words) {
114 |       codes[nodeid] = code;
115 |       points[nodeid] = point;
116 |       max_depth_ = std::max(max_depth_, 
117 |           static_cast<int>(code.size()));
118 |     } else {
119 |       point.push_back(nodeid - num_words);
120 |       std::vector<bool> left_code = code;
121 |       std::vector<bool> right_code = code;
122 |       left_code.push_back(false);
123 |       right_code.push_back(true);
124 |       auto& node = huffman_nodes[nodeid];
125 |       stack.push_back(make_tuple(node.left, left_code, point));
126 |       stack.push_back(make_tuple(node.right, right_code, point));
127 |     }
128 |   }
129 |   
130 |   std::vector<bool> host_codes;
131 |   std::vector<int> host_points;
132 |   std::vector<int> host_hs_indptr = {0};
133 |   int size = 0;
134 |   for (int i = 0; i < num_words; ++i) {
135 |     code = codes[i];
136 |     point = points[i];
137 |     int n = code.size();
138 |     size += n;
139 |     host_hs_indptr.push_back(size);
140 |     for (int j = 0; j < n; ++j) {
141 |       host_codes.push_back(code[j]);
142 |       host_points.push_back(point[j]);
143 |     }
144 |   }
145 |    
146 |   dev_codes_.resize(size); dev_points_.resize(size), dev_hs_indptr_.resize(num_words + 1);
147 |   thrust::copy(host_codes.begin(), host_codes.end(), dev_codes_.begin());
148 |   thrust::copy(host_points.begin(), host_points.end(), dev_points_.begin());
149 |   thrust::copy(host_hs_indptr.begin(), host_hs_indptr.end(), dev_hs_indptr_.begin());
150 |   CHECK_CUDA(cudaDeviceSynchronize());
151 |   
152 |   huffman_nodes.clear();
153 | }
154 | 
155 | void CuW2V::LoadModel(float* emb_in, float* emb_out) {
156 |   int out_words = neg_? num_words_: num_words_ - 1;
157 | 
158 |   // copy embedding
159 |   DEBUG("copy model({} x {})", num_words_, num_dims_);
160 |   dev_emb_in_.resize(num_words_ * num_dims_);
161 |   dev_emb_out_.resize(out_words * num_dims_);
162 |   thrust::copy(emb_in, emb_in + num_words_ * num_dims_, dev_emb_in_.begin());
163 |   thrust::copy(emb_out, emb_out + out_words * num_dims_, dev_emb_out_.begin());
164 |   emb_in_ = emb_in; emb_out_ = emb_out;
165 |   
166 |   CHECK_CUDA(cudaDeviceSynchronize());
167 | }
168 | 
169 | 
170 | std::pair<float, float> CuW2V::FeedData(const int* cols, const int* indptr, 
171 |     const int num_cols, const int num_indptr) {
172 |   
173 |   // copy feed data to GPU memory
174 |   thrust::device_vector<int> dev_cols(num_cols); 
175 |   thrust::device_vector<int> dev_indptr(num_indptr + 1);
176 |   thrust::device_vector<float> dev_loss_nume(block_cnt_, 0.0f);
177 |   thrust::device_vector<float> dev_loss_deno(block_cnt_, 0.0f);
178 |   thrust::copy(cols, cols + num_cols, dev_cols.begin());
179 |   thrust::copy(indptr, indptr + num_indptr + 1, dev_indptr.begin());
180 |   CHECK_CUDA(cudaDeviceSynchronize());
181 |   DEBUG0("copy feed data to GPU memory");
182 | 
183 |   // run GPU kernels
184 |   if (neg_ > 0) {
185 |     if (sg_) {
186 |       W2VNegSgKernel<<<block_cnt_, block_dim_, num_dims_ * sizeof(float)>>>(
187 |         thrust::raw_pointer_cast(dev_cols.data()),
188 |         thrust::raw_pointer_cast(dev_indptr.data()),
189 |         thrust::raw_pointer_cast(dev_random_table_.data()),
190 |         thrust::raw_pointer_cast(dev_rngs_.data()),
191 |         random_size_, num_indptr, num_dims_, neg_, window_size_,
192 |         thrust::raw_pointer_cast(dev_emb_in_.data()),
193 |         thrust::raw_pointer_cast(dev_emb_out_.data()),
194 |         thrust::raw_pointer_cast(dev_loss_nume.data()),
195 |         thrust::raw_pointer_cast(dev_loss_deno.data()),
196 |         lr_);
197 |     } else {
198 |       W2VNegCbowKernel<<<block_cnt_, block_dim_, 2 * num_dims_ * sizeof(float)>>>(
199 |         thrust::raw_pointer_cast(dev_cols.data()),
200 |         thrust::raw_pointer_cast(dev_indptr.data()),
201 |         thrust::raw_pointer_cast(dev_random_table_.data()),
202 |         thrust::raw_pointer_cast(dev_rngs_.data()),
203 |         random_size_, num_indptr, num_dims_, neg_, window_size_,
204 |         thrust::raw_pointer_cast(dev_emb_in_.data()),
205 |         thrust::raw_pointer_cast(dev_emb_out_.data()),
206 |         thrust::raw_pointer_cast(dev_loss_nume.data()),
207 |         thrust::raw_pointer_cast(dev_loss_deno.data()),
208 |         cbow_mean_, lr_);
209 |     }
210 |   } else {
211 |     if (sg_) {
212 |       W2VHsSgKernel<<<block_cnt_, block_dim_, num_dims_ * sizeof(float)>>>(
213 |         thrust::raw_pointer_cast(dev_cols.data()),
214 |         thrust::raw_pointer_cast(dev_indptr.data()),
215 |         thrust::raw_pointer_cast(dev_codes_.data()),
216 |         thrust::raw_pointer_cast(dev_points_.data()),
217 |         thrust::raw_pointer_cast(dev_hs_indptr_.data()),
218 |         num_indptr, num_dims_, window_size_,
219 |         thrust::raw_pointer_cast(dev_rngs_.data()),
220 |         thrust::raw_pointer_cast(dev_emb_in_.data()),
221 |         thrust::raw_pointer_cast(dev_emb_out_.data()),
222 |         thrust::raw_pointer_cast(dev_loss_nume.data()),
223 |         thrust::raw_pointer_cast(dev_loss_deno.data()),
224 |         lr_);
225 | 
226 |     } else {
227 |       W2VHsCbowKernel<<<block_cnt_, block_dim_, 2 * num_dims_ * sizeof(float)>>>(
228 |         thrust::raw_pointer_cast(dev_cols.data()),
229 |         thrust::raw_pointer_cast(dev_indptr.data()),
230 |         thrust::raw_pointer_cast(dev_codes_.data()),
231 |         thrust::raw_pointer_cast(dev_points_.data()),
232 |         thrust::raw_pointer_cast(dev_hs_indptr_.data()),
233 |         num_indptr, num_dims_, window_size_,
234 |         thrust::raw_pointer_cast(dev_rngs_.data()),
235 |         thrust::raw_pointer_cast(dev_emb_in_.data()),
236 |         thrust::raw_pointer_cast(dev_emb_out_.data()),
237 |         thrust::raw_pointer_cast(dev_loss_nume.data()),
238 |         thrust::raw_pointer_cast(dev_loss_deno.data()),
239 |         cbow_mean_, lr_);
240 | 
241 |     }
242 | 
243 |   }
244 |   CHECK_CUDA(cudaDeviceSynchronize());
245 | 
246 |   // accumulate loss nume / deno
247 |   std::vector<float> loss_nume(block_cnt_), loss_deno(block_cnt_);
248 |   thrust::copy(dev_loss_nume.begin(), dev_loss_nume.end(), loss_nume.begin());
249 |   thrust::copy(dev_loss_deno.begin(), dev_loss_deno.end(), loss_deno.begin());
250 |   CHECK_CUDA(cudaDeviceSynchronize());
251 |   float loss_nume_sum = std::accumulate(loss_nume.begin(), loss_nume.end(), 0.0f); 
252 |   float loss_deno_sum = std::accumulate(loss_deno.begin(), loss_deno.end(), 0.0f); 
253 |   DEBUG("loss nume: {}, deno: {}", loss_nume_sum, loss_deno_sum);
254 | 
255 |   return {loss_nume_sum, loss_deno_sum};
256 | }
257 | 
258 | void CuW2V::Pull() {
259 |   thrust::copy(dev_emb_in_.begin(), dev_emb_in_.end(), emb_in_);
260 |   thrust::copy(dev_emb_out_.begin(), dev_emb_out_.end(), emb_out_);
261 |   CHECK_CUDA(cudaDeviceSynchronize());
262 | }
263 | 
264 | }  // namespace cusim
265 | 


--------------------------------------------------------------------------------
/cpp/src/utils/ioutils.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | #include "utils/ioutils.hpp"
  7 | 
  8 | namespace cusim {
  9 | 
 10 | IoUtils::IoUtils() {
 11 |   logger_container_.reset(new CuSimLogger("ioutils"));
 12 |   logger_ = logger_container_->get_logger();
 13 | }
 14 | 
 15 | IoUtils::~IoUtils() {}
 16 | 
 17 | bool IoUtils::Init(std::string opt_path) {
 18 |   std::ifstream in(opt_path.c_str());
 19 |   if (not in.is_open()) return false;
 20 | 
 21 |   std::string str((std::istreambuf_iterator<char>(in)),
 22 |       std::istreambuf_iterator<char>());
 23 |   std::string err_cmt;
 24 |   auto _opt = json11::Json::parse(str, err_cmt);
 25 |   if (not err_cmt.empty()) return false;
 26 |   opt_ = _opt;
 27 |   logger_container_->set_log_level(opt_["c_log_level"].int_value());
 28 |   lower_ = opt_["lower"].bool_value();
 29 |   return true;
 30 | }
 31 | 
 32 | void IoUtils::ParseLine(std::string line, std::vector<std::string>& ret) {
 33 |   ParseLineImpl(line, ret);
 34 | }
 35 | 
 36 | 
 37 | void IoUtils::ParseLineImpl(std::string line, std::vector<std::string>& ret) {
 38 |   ret.clear();
 39 |   int n = line.size();
 40 |   std::string element;
 41 |   for (int i = 0; i < n; ++i) {
 42 |     if (line[i] == ' ') {
 43 |       ret.push_back(element);
 44 |       element.clear();
 45 |     } else {
 46 |       element += (lower_? std::tolower(line[i]): line[i]);
 47 |     }
 48 |   }
 49 |   if (element.size() > 0) {
 50 |     ret.push_back(element);
 51 |   }
 52 | }
 53 | 
 54 | int64_t IoUtils::LoadStreamFile(std::string filepath) {
 55 |   INFO("read stream file to generate vocabulary: {}", filepath);
 56 |   if (fin_.is_open()) fin_.close();
 57 |   fin_.open(filepath.c_str());
 58 |   int64_t count = 0;
 59 |   std::string line;
 60 |   while (getline(fin_, line))
 61 |     count++;
 62 |   fin_.close();
 63 |   fin_.open(filepath.c_str());
 64 |   num_lines_ = count;
 65 |   remain_lines_ = num_lines_;
 66 |   INFO("number of lines: {}", num_lines_);
 67 |   return count;
 68 | }
 69 | 
 70 | std::pair<int, int> IoUtils::TokenizeStream(int num_lines, int num_threads) {
 71 |   int read_lines = static_cast<int>(std::min(static_cast<int64_t>(num_lines), remain_lines_));
 72 |   if (not read_lines) return {0, 0};
 73 |   remain_lines_ -= read_lines;
 74 |   cols_.clear();
 75 |   cols_.resize(read_lines);
 76 |   indptr_.resize(read_lines);
 77 |   std::fill(indptr_.begin(), indptr_.end(), 0);
 78 |   #pragma omp parallel num_threads(num_threads)
 79 |   {
 80 |     std::string line;
 81 |     std::vector<std::string> line_vec;
 82 |     #pragma omp for schedule(dynamic, 4)
 83 |     for (int i = 0; i < read_lines; ++i) {
 84 |       // get line thread-safely
 85 |       {
 86 |         std::unique_lock<std::mutex> lock(global_lock_);
 87 |         getline(fin_, line);
 88 |       }
 89 | 
 90 |       // seems to be bottle-neck
 91 |       ParseLine(line, line_vec);
 92 | 
 93 |       // tokenize
 94 |       for (auto& word: line_vec) {
 95 |         if (not word_idmap_.count(word)) continue;
 96 |         cols_[i].push_back(word_idmap_[word]);
 97 |       }
 98 |     }
 99 |   }
100 |   int cumsum = 0;
101 |   for (int i = 0; i < read_lines; ++i) {
102 |     cumsum += cols_[i].size();
103 |     indptr_[i] = cumsum;
104 |   }
105 |   return {read_lines, indptr_[read_lines - 1]};
106 | }
107 | 
108 | void IoUtils::GetToken(int* rows, int* cols, int* indptr) {
109 |   int n = cols_.size();
110 |   for (int i = 0; i < n; ++i) {
111 |     int beg = i == 0? 0: indptr_[i - 1];
112 |     int end = indptr_[i];
113 |     for (int j = beg; j < end; ++j) {
114 |       rows[j] = i;
115 |       cols[j] = cols_[i][j - beg];
116 |     }
117 |     indptr[i] = indptr_[i];
118 |   }
119 | }
120 | 
121 | std::pair<int, int> IoUtils::ReadStreamForVocab(int num_lines, int num_threads) {
122 |   int read_lines = static_cast<int>(std::min(static_cast<int64_t>(num_lines), remain_lines_));
123 |   remain_lines_ -= read_lines;
124 |   #pragma omp parallel num_threads(num_threads)
125 |   {
126 |     std::string line;
127 |     std::vector<std::string> line_vec;
128 |     std::unordered_map<std::string, int> word_count;
129 |     #pragma omp for schedule(dynamic, 4)
130 |     for (int i = 0; i < read_lines; ++i) {
131 |       // get line thread-safely
132 |       {
133 |         std::unique_lock<std::mutex> lock(global_lock_);
134 |         getline(fin_, line);
135 |       }
136 | 
137 |       // seems to be bottle-neck
138 |       ParseLine(line, line_vec);
139 | 
140 |       // update private word count
141 |       for (auto& word: line_vec) {
142 |         word_count[word]++;
143 |       }
144 |     }
145 | 
146 |     // update word count to class variable
147 |     {
148 |       std::unique_lock<std::mutex> lock(global_lock_);
149 |       for (auto& it: word_count) {
150 |         word_count_[it.first] += it.second;
151 |       }
152 |     }
153 |   }
154 |   if (not remain_lines_) fin_.close();
155 |   return {read_lines, word_count_.size()};
156 | }
157 | 
158 | void IoUtils::GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
159 |   INFO("number of raw words: {}", word_count_.size());
160 |   word_idmap_.clear(); word_list_.clear();
161 |   for (auto& it: word_count_) {
162 |     if (it.second >= min_count) {
163 |       word_idmap_[it.first] = word_idmap_.size();
164 |       word_list_.push_back(it.first);
165 |     }
166 |   }
167 |   INFO("number of words after filtering: {}", word_list_.size());
168 | 
169 |   // write keys and count to csv file
170 |   std::ofstream fout1(keys_path.c_str());
171 |   std::ofstream fout2(count_path.c_str());
172 |   INFO("dump keys to {}", keys_path);
173 |   int n = word_list_.size();
174 |   for (int i = 0; i < n; ++i) {
175 |     std::string line = word_list_[i] + "\n";
176 |     fout1.write(line.c_str(), line.size());
177 |     line = std::to_string(word_count_[word_list_[i]]) + "\n";
178 |     fout2.write(line.c_str(), line.size());
179 |   }
180 |   fout1.close(); fout2.close();
181 | }
182 | 
183 | std::tuple<int64_t, int, int64_t> IoUtils::ReadBagOfWordsHeader(std::string filepath) {
184 |   INFO("read bag of words file: {} (format reference: https://archive.ics.uci.edu/ml/datasets/bag+of+words)",
185 |       filepath);
186 |   if (fin_.is_open()) fin_.close();
187 |   fin_.open(filepath.c_str());
188 |   std::string line;
189 |   std::stringstream sstr;
190 |   int64_t num_docs, nnz;
191 |   int num_words;
192 |   getline(fin_, line);
193 |   sstr << line; sstr >> num_docs; sstr.clear();
194 |   getline(fin_, line);
195 |   num_words = std::stoi(line);
196 |   getline(fin_, line);
197 |   sstr << line; sstr >> nnz; sstr.clear();
198 |   return {num_docs, num_words, nnz};
199 | }
200 | 
201 | void IoUtils::ReadBagOfWordsContent(int64_t* rows, int* cols, float* counts, const int num_lines) {
202 |   if (not fin_.is_open()) throw std::runtime_error("file is not open");
203 |   std::string line;
204 |   std::stringstream sstr;
205 |   int64_t row;
206 |   int col;
207 |   float count;
208 |   std::vector<std::string> line_vec;
209 |   for (int i = 0; i < num_lines; ++i) {
210 |     getline(fin_, line);
211 |     ParseLine(line, line_vec);
212 |     sstr << line_vec[0]; sstr >> row; sstr.clear();
213 |     col = std::stoi(line_vec[1]);
214 |     count = std::stof(line_vec[2]);
215 |     rows[i] = row - 1; cols[i] = col - 1; counts[i] = count;
216 |     line_vec.clear();
217 |   }
218 |   if (fin_.eof()) fin_.close();
219 | }
220 | 
221 | }  // namespace cusim
222 | 


--------------------------------------------------------------------------------
/cpp/src/utils/log.cc:
--------------------------------------------------------------------------------
 1 | // Copyright (c) 2020 Jisang Yoon
 2 | // All rights reserved.
 3 | //
 4 | // This source code is licensed under the Apache 2.0 license found in the
 5 | // LICENSE file in the root directory of this source tree.
 6 | 
 7 | // reference: https://github.com/kakao/buffalo/blob/5f571c2c7d8227e6625c6e538da929e4db11b66d/lib/misc/log.cc
 8 | #include "utils/log.hpp"
 9 | 
10 | 
11 | namespace cusim {
12 | int CuSimLogger::global_logging_level_ = 2;
13 | 
14 | CuSimLogger::CuSimLogger() {
15 |   spdlog::set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v");
16 |   logger_ = spdlog::default_logger();
17 | }
18 | 
19 | CuSimLogger::CuSimLogger(std::string name) {
20 |   // auto console_sink = std::make_shared<spdlog::sinks::stdout_color_sink_mt>();
21 |   auto stderr_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
22 |   // spdlog::sinks_init_list sinks = {console_sink, stderr_sink};
23 |   logger_ = std::make_shared<spdlog::logger>(name, stderr_sink);
24 |   logger_->set_pattern("[%^%-8l%$] %Y-%m-%d %H:%M:%S %v");
25 | }
26 | 
27 | std::shared_ptr<spdlog::logger>& CuSimLogger::get_logger() {
28 |   return logger_;
29 | }
30 | 
31 | void CuSimLogger::set_log_level(int level) {
32 |   global_logging_level_ = level;
33 |   switch (level) {
34 |     case 0: logger_->set_level(spdlog::level::off); break;
35 |     case 1: logger_->set_level(spdlog::level::warn); break;
36 |     case 2: logger_->set_level(spdlog::level::info); break;
37 |     case 3: logger_->set_level(spdlog::level::debug); break;
38 |     default: logger_->set_level(spdlog::level::trace); break;
39 |   }
40 | }
41 | 
42 | int CuSimLogger::get_log_level() {
43 |   return global_logging_level_;
44 | }
45 | 
46 | }  // namespace cusim
47 | 


--------------------------------------------------------------------------------
/cuda_setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # Adapted from https://github.com/rmcgibbo/npcuda-example and
  8 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py
  9 | # pylint: disable=fixme,access-member-before-definition
 10 | # pylint: disable=attribute-defined-outside-init,arguments-differ
 11 | import logging
 12 | import os
 13 | import sys
 14 | 
 15 | from distutils import ccompiler, errors, msvccompiler, unixccompiler
 16 | from setuptools.command.build_ext import build_ext as setuptools_build_ext
 17 | 
 18 | HALF_PRECISION = False
 19 | 
 20 | def find_in_path(name, path):
 21 |   "Find a file in a search path"
 22 |   # adapted fom http://code.activestate.com/
 23 |   # recipes/52224-find-a-file-given-a-search-path/
 24 |   for _dir in path.split(os.pathsep):
 25 |     binpath = os.path.join(_dir, name)
 26 |     if os.path.exists(binpath):
 27 |       return os.path.abspath(binpath)
 28 |   return None
 29 | 
 30 | # reference: https://arnon.dk/
 31 | # matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
 32 | def get_cuda_sm_list(cuda_ver):
 33 |   if "CUDA_SM_LIST" in os.environ:
 34 |     sm_list = os.environ["CUDA_SM_LIST"].split(",")
 35 |   else:
 36 |     sm_list = ["30", "52", "60", "61", "70", "75", "80", "86"]
 37 |     if cuda_ver >= 110:
 38 |       filter_list = ["30"]
 39 |       if cuda_ver == 110:
 40 |         filter_list += ["86"]
 41 |     else:
 42 |       filter_list = ["80", "86"]
 43 |       if cuda_ver < 100:
 44 |         filter_list += ["75"]
 45 |       if cuda_ver < 90:
 46 |         filter_list += ["70"]
 47 |       if cuda_ver < 80:
 48 |         filter_list += ["60", "61"]
 49 |     sm_list = [sm for sm in sm_list if sm not in filter_list]
 50 |   return sm_list
 51 | 
 52 | 
 53 | def get_cuda_compute(cuda_ver):
 54 |   if "CUDA_COMPUTE" in os.environ:
 55 |     compute = os.environ["CUDA_COMPUTE"]
 56 |   else:
 57 |     if 70 <= cuda_ver < 80:
 58 |       compute = "52"
 59 |     if 80 <= cuda_ver < 90:
 60 |       compute = "61"
 61 |     if 90 <= cuda_ver < 100:
 62 |       compute = "70"
 63 |     if 100 <= cuda_ver < 110:
 64 |       compute = "75"
 65 |     if cuda_ver == 110:
 66 |       compute = "80"
 67 |     if cuda_ver == 111:
 68 |       compute = "86"
 69 |   return compute
 70 | 
 71 | 
 72 | def get_cuda_arch(cuda_ver):
 73 |   if "CUDA_ARCH" in os.environ:
 74 |     arch = os.environ["CUDA_ARCH"]
 75 |   else:
 76 |     if 70 <= cuda_ver < 92:
 77 |       arch = "30"
 78 |     if 92 <= cuda_ver < 110:
 79 |       arch = "50"
 80 |     if cuda_ver == 110:
 81 |       arch = "52"
 82 |     if cuda_ver == 111:
 83 |       arch = "80"
 84 |   return arch
 85 | 
 86 | def locate_cuda():
 87 |   """Locate the CUDA environment on the system
 88 |   If a valid cuda installation is found
 89 |   this returns a dict with keys 'home', 'nvcc', 'include',
 90 |   and 'lib64' and values giving the absolute path to each directory.
 91 |   Starts by looking for the CUDAHOME env variable.
 92 |   If not found, everything is based on finding
 93 |   'nvcc' in the PATH.
 94 |   If nvcc can't be found, this returns None
 95 |   """
 96 |   nvcc_bin = 'nvcc'
 97 |   if sys.platform.startswith("win"):
 98 |     nvcc_bin = 'nvcc.exe'
 99 | 
100 |   # check env variables CUDA_HOME, CUDAHOME, CUDA_PATH.
101 |   found = False
102 |   for env_name in ['CUDA_PATH', 'CUDAHOME', 'CUDA_HOME']:
103 |     if env_name not in os.environ:
104 |       continue
105 |     found = True
106 |     home = os.environ[env_name]
107 |     nvcc = os.path.join(home, 'bin', nvcc_bin)
108 |     break
109 |   if not found:
110 |     # otherwise, search the PATH for NVCC
111 |     nvcc = find_in_path(nvcc_bin, os.environ['PATH'])
112 |     if nvcc is None:
113 |       logging.warning('The nvcc binary could not be located in your '
114 |               '$PATH. Either add it to '
115 |               'your path, or set $CUDA_HOME to enable CUDA extensions')
116 |       return None
117 |     home = os.path.dirname(os.path.dirname(nvcc))
118 |   cudaconfig = {'home': home,
119 |                 'nvcc': nvcc,
120 |                 'include': os.path.join(home, 'include'),
121 |                 'lib64':   os.path.join(home, 'lib64')}
122 |   cuda_ver = os.path.basename(os.path.realpath(home)).split("-")[1].split(".")
123 |   major, minor = int(cuda_ver[0]), int(cuda_ver[1])
124 |   cuda_ver = 10 * major + minor
125 |   assert cuda_ver >= 70, f"too low cuda ver {major}.{minor}"
126 |   print(f"cuda_ver: {major}.{minor}")
127 |   arch = get_cuda_arch(cuda_ver)
128 |   sm_list = get_cuda_sm_list(cuda_ver)
129 |   compute = get_cuda_compute(cuda_ver)
130 |   post_args = [f"-arch=sm_{arch}"] + \
131 |     [f"-gencode=arch=compute_{sm},code=sm_{sm}" for sm in sm_list] + \
132 |     [f"-gencode=arch=compute_{compute},code=compute_{compute}",
133 |      "--ptxas-options=-v", "-O2"]
134 |   print(f"nvcc post args: {post_args}")
135 |   if HALF_PRECISION:
136 |     post_args = [flag for flag in post_args if "52" not in flag]
137 | 
138 |   if sys.platform == "win32":
139 |     cudaconfig['lib64'] = os.path.join(home, 'lib', 'x64')
140 |     post_args += ['-Xcompiler', '/MD', '-std=c++14',  "-Xcompiler", "/openmp"]
141 |     if HALF_PRECISION:
142 |       post_args += ["-Xcompiler", "/D HALF_PRECISION"]
143 |   else:
144 |     post_args += ['-c', '--compiler-options', "'-fPIC'",
145 |                   "--compiler-options", "'-std=c++14'"]
146 |     if HALF_PRECISION:
147 |       post_args += ["--compiler-options", "'-D HALF_PRECISION'"]
148 |   for k, val in cudaconfig.items():
149 |     if not os.path.exists(val):
150 |       logging.warning('The CUDA %s path could not be located in %s', k, val)
151 |       return None
152 | 
153 |   cudaconfig['post_args'] = post_args
154 |   return cudaconfig
155 | 
156 | 
157 | # This code to build .cu extensions with nvcc is taken from cupy:
158 | # https://github.com/cupy/cupy/blob/master/cupy_setup_build.py
159 | class _UnixCCompiler(unixccompiler.UnixCCompiler):
160 |   src_extensions = list(unixccompiler.UnixCCompiler.src_extensions)
161 |   src_extensions.append('.cu')
162 | 
163 |   def _compile(self, obj, src, ext, cc_args, extra_postargs, pp_opts):
164 |     # For sources other than CUDA C ones, just call the super class method.
165 |     if os.path.splitext(src)[1] != '.cu':
166 |       return unixccompiler.UnixCCompiler._compile(
167 |         self, obj, src, ext, cc_args, extra_postargs, pp_opts)
168 | 
169 |     # For CUDA C source files, compile them with NVCC.
170 |     _compiler_so = self.compiler_so
171 |     try:
172 |       nvcc_path = CUDA['nvcc']
173 |       post_args = CUDA['post_args']
174 |       # TODO? base_opts = build.get_compiler_base_options()
175 |       self.set_executable('compiler_so', nvcc_path)
176 | 
177 |       return unixccompiler.UnixCCompiler._compile(
178 |         self, obj, src, ext, cc_args, post_args, pp_opts)
179 |     finally:
180 |       self.compiler_so = _compiler_so
181 | 
182 | 
183 | class _MSVCCompiler(msvccompiler.MSVCCompiler):
184 |   _cu_extensions = ['.cu']
185 | 
186 |   src_extensions = list(unixccompiler.UnixCCompiler.src_extensions)
187 |   src_extensions.extend(_cu_extensions)
188 | 
189 |   def _compile_cu(self, sources, output_dir=None, macros=None,
190 |           include_dirs=None, debug=0, extra_preargs=None,
191 |           extra_postargs=None, depends=None):
192 |     # Compile CUDA C files, mainly derived from UnixCCompiler._compile().
193 |     macros, objects, extra_postargs, pp_opts, _build = \
194 |       self._setup_compile(output_dir, macros, include_dirs, sources,
195 |                 depends, extra_postargs)
196 | 
197 |     compiler_so = CUDA['nvcc']
198 |     cc_args = self._get_cc_args(pp_opts, debug, extra_preargs)
199 |     post_args = CUDA['post_args']
200 | 
201 |     for obj in objects:
202 |       try:
203 |         src, _ = _build[obj]
204 |       except KeyError:
205 |         continue
206 |       try:
207 |         self.spawn([compiler_so] + cc_args + [src, '-o', obj] + post_args)
208 |       except errors.DistutilsExecError as e:
209 |         raise errors.CompileError(str(e))
210 | 
211 |     return objects
212 | 
213 |   def compile(self, sources, **kwargs):
214 |     # Split CUDA C sources and others.
215 |     cu_sources = []
216 |     other_sources = []
217 |     for source in sources:
218 |       if os.path.splitext(source)[1] == '.cu':
219 |         cu_sources.append(source)
220 |       else:
221 |         other_sources.append(source)
222 | 
223 |     # Compile source files other than CUDA C ones.
224 |     other_objects = msvccompiler.MSVCCompiler.compile(
225 |       self, other_sources, **kwargs)
226 | 
227 |     # Compile CUDA C sources.
228 |     cu_objects = self._compile_cu(cu_sources, **kwargs)
229 | 
230 |     # Return compiled object filenames.
231 |     return other_objects + cu_objects
232 | 
233 | 
234 | class CudaBuildExt(setuptools_build_ext):
235 |   """Custom `build_ext` command to include CUDA C source files."""
236 | 
237 |   def run(self):
238 |     if CUDA is not None:
239 |       def wrap_new_compiler(func):
240 |         def _wrap_new_compiler(*args, **kwargs):
241 |           try:
242 |             return func(*args, **kwargs)
243 |           except errors.DistutilsPlatformError:
244 |             if sys.platform != 'win32':
245 |               CCompiler = _UnixCCompiler
246 |             else:
247 |               CCompiler = _MSVCCompiler
248 |             return CCompiler(
249 |               None, kwargs['dry_run'], kwargs['force'])
250 |         return _wrap_new_compiler
251 |       ccompiler.new_compiler = wrap_new_compiler(ccompiler.new_compiler)
252 |       # Intentionally causes DistutilsPlatformError in
253 |       # ccompiler.new_compiler() function to hook.
254 |       self.compiler = 'nvidia'
255 | 
256 |     setuptools_build_ext.run(self)
257 | 
258 | 
259 | CUDA = locate_cuda()
260 | assert CUDA is not None
261 | BUILDEXT = CudaBuildExt if CUDA else setuptools_build_ext
262 | 


--------------------------------------------------------------------------------
/cusim/.gitignore:
--------------------------------------------------------------------------------
1 | config_pb2.py
2 | 


--------------------------------------------------------------------------------
/cusim/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cusim.ioutils import IoUtils
7 | from cusim.culda import CuLDA
8 | from cusim.cuw2v import CuW2V
9 | 


--------------------------------------------------------------------------------
/cusim/aux.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | import os
  7 | import re
  8 | import sys
  9 | import json
 10 | import time
 11 | import logging
 12 | import logging.handlers
 13 | import numpy as np
 14 | import jsmin
 15 | from google.protobuf.json_format import Parse, MessageToDict
 16 | 
 17 | # get_logger and Option refer to
 18 | # https://github.com/kakao/buffalo/blob/
 19 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/buffalo/misc/aux.py
 20 | def get_logger(name=__file__, level=2):
 21 |   if level == 1:
 22 |     level = logging.WARNING
 23 |   elif level == 2:
 24 |     level = logging.INFO
 25 |   elif level == 3:
 26 |     level = logging.DEBUG
 27 |   logger = logging.getLogger(name)
 28 |   if logger.handlers:
 29 |     return logger
 30 |   logger.setLevel(level)
 31 |   sh0 = logging.StreamHandler()
 32 |   sh0.setLevel(level)
 33 |   formatter = logging.Formatter('[%(levelname)-8s] %(asctime)s '
 34 |                                 '[%(filename)s] [%(funcName)s:%(lineno)d]'
 35 |                                 '%(message)s', '%Y-%m-%d %H:%M:%S')
 36 |   sh0.setFormatter(formatter)
 37 |   logger.addHandler(sh0)
 38 |   return logger
 39 | 
 40 | # This function helps you to read non-standard json strings.
 41 | # - Handles json string with c++ style inline comments
 42 | # - Handles json string with trailing commas.
 43 | def load_json_string(cont):
 44 |   # (1) Removes comment.
 45 |   #     Refer to https://plus.google.com/+DouglasCrockfordEsq/posts/RK8qyGVaGSr
 46 |   cont = jsmin.jsmin(cont)
 47 | 
 48 |   # (2) Removes trailing comma.
 49 |   cont = re.sub(",[ \t\r\n]*}", "}", cont)
 50 |   cont = re.sub(",[ \t\r\n]*" + r"\]", "]", cont)
 51 | 
 52 |   return json.loads(cont)
 53 | 
 54 | 
 55 | # function read json file from filename
 56 | def load_json_file(fname):
 57 |   with open(fname, "r") as fin:
 58 |     ret = load_json_string(fin.read())
 59 |   return ret
 60 | 
 61 | # use protobuf to restrict field and types
 62 | def get_opt_as_proto(raw, proto_type=None):
 63 |   assert proto_type is not None
 64 |   proto = proto_type()
 65 |   # convert raw to proto
 66 |   Parse(json.dumps(Option(raw)), proto)
 67 |   err = []
 68 |   assert proto.IsInitialized(err), \
 69 |     f"some required fields are missing in proto {err}\n {proto}"
 70 |   return proto
 71 | 
 72 | def proto_to_dict(proto):
 73 |   return MessageToDict(proto, \
 74 |     including_default_value_fields=True, \
 75 |     preserving_proto_field_name=True)
 76 | 
 77 | def copy_proto(proto):
 78 |   newproto = type(proto)()
 79 |   Parse(json.dumps(proto_to_dict(proto)), newproto)
 80 |   return newproto
 81 | 
 82 | class Option(dict):
 83 |   def __init__(self, *args, **kwargs):
 84 |     args = [arg if isinstance(arg, dict)
 85 |             else load_json_file(arg) for arg in args]
 86 |     super().__init__(*args, **kwargs)
 87 |     for arg in args:
 88 |       if isinstance(arg, dict):
 89 |         for k, val in arg.items():
 90 |           if isinstance(val, dict):
 91 |             self[k] = Option(val)
 92 |           else:
 93 |             self[k] = val
 94 |     if kwargs:
 95 |       for k, val in kwargs.items():
 96 |         if isinstance(val, dict):
 97 |           self[k] = Option(val)
 98 |         else:
 99 |           self[k] = val
100 | 
101 |   def __getattr__(self, attr):
102 |     return self.get(attr)
103 | 
104 |   def __setattr__(self, key, value):
105 |     self.__setitem__(key, value)
106 | 
107 |   def __setitem__(self, key, value):
108 |     super().__setitem__(key, value)
109 |     self.__dict__.update({key: value})
110 | 
111 |   def __delattr__(self, item):
112 |     self.__delitem__(item)
113 | 
114 |   def __delitem__(self, key):
115 |     super().__delitem__(key)
116 |     del self.__dict__[key]
117 | 
118 |   def __getstate__(self):
119 |     return vars(self)
120 | 
121 |   def __setstate__(self, state):
122 |     vars(self).update(state)
123 | 
124 | # reference: https://github.com/tensorflow/tensorflow/blob/
125 | # 85c8b2a817f95a3e979ecd1ed95bff1dc1335cff/tensorflow/python/
126 | # keras/utils/generic_utils.py#L483
127 | class Progbar:
128 |   # pylint: disable=too-many-branches,too-many-statements,invalid-name
129 |   # pylint: disable=blacklisted-name,no-else-return
130 |   """Displays a progress bar.
131 |   Arguments:
132 |       target: Total number of steps expected, None if unknown.
133 |       width: Progress bar width on screen.
134 |       verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose)
135 |       stateful_metrics: Iterable of string names of metrics that should *not* be
136 |         averaged over time. Metrics in this list will be displayed as-is. All
137 |         others will be averaged by the progbar before display.
138 |       interval: Minimum visual progress update interval (in seconds).
139 |       unit_name: Display name for step counts (usually "step" or "sample").
140 |   """
141 | 
142 |   def __init__(self,
143 |                target,
144 |                width=30,
145 |                verbose=1,
146 |                interval=0.05,
147 |                stateful_metrics=None,
148 |                unit_name='step'):
149 |     self.target = target
150 |     self.width = width
151 |     self.verbose = verbose
152 |     self.interval = interval
153 |     self.unit_name = unit_name
154 |     if stateful_metrics:
155 |       self.stateful_metrics = set(stateful_metrics)
156 |     else:
157 |       self.stateful_metrics = set()
158 | 
159 |     self._dynamic_display = ((hasattr(sys.stdout, 'isatty') and
160 |                               sys.stdout.isatty()) or
161 |                              'ipykernel' in sys.modules or
162 |                              'posix' in sys.modules or
163 |                              'PYCHARM_HOSTED' in os.environ)
164 |     self._total_width = 0
165 |     self._seen_so_far = 0
166 |     # We use a dict + list to avoid garbage collection
167 |     # issues found in OrderedDict
168 |     self._values = {}
169 |     self._values_order = []
170 |     self._start = time.time()
171 |     self._last_update = 0
172 | 
173 |     self._time_after_first_step = None
174 | 
175 |   def update(self, current, values=None, finalize=None):
176 |     """Updates the progress bar.
177 |     Arguments:
178 |         current: Index of current step.
179 |         values: List of tuples: `(name, value_for_last_step)`. If `name` is in
180 |           `stateful_metrics`, `value_for_last_step` will be displayed as-is.
181 |           Else, an average of the metric over time will be displayed.
182 |         finalize: Whether this is the last update for the progress bar. If
183 |           `None`, defaults to `current >= self.target`.
184 |     """
185 |     if finalize is None:
186 |       if self.target is None:
187 |         finalize = False
188 |       else:
189 |         finalize = current >= self.target
190 | 
191 |     values = values or []
192 |     for k, v in values:
193 |       if k not in self._values_order:
194 |         self._values_order.append(k)
195 |       if k not in self.stateful_metrics:
196 |         # In the case that progress bar doesn't have a target value in the first
197 |         # epoch, both on_batch_end and on_epoch_end will be called, which will
198 |         # cause 'current' and 'self._seen_so_far' to have the same value. Force
199 |         # the minimal value to 1 here, otherwise stateful_metric will be 0s.
200 |         value_base = max(current - self._seen_so_far, 1)
201 |         if k not in self._values:
202 |           self._values[k] = [v * value_base, value_base]
203 |         else:
204 |           self._values[k][0] += v * value_base
205 |           self._values[k][1] += value_base
206 |       else:
207 |         # Stateful metrics output a numeric value. This representation
208 |         # means "take an average from a single value" but keeps the
209 |         # numeric formatting.
210 |         self._values[k] = [v, 1]
211 |     self._seen_so_far = current
212 | 
213 |     now = time.time()
214 |     info = ' - %.0fs' % (now - self._start)
215 |     if self.verbose == 1:
216 |       if now - self._last_update < self.interval and not finalize:
217 |         return
218 | 
219 |       prev_total_width = self._total_width
220 |       if self._dynamic_display:
221 |         sys.stdout.write('\b' * prev_total_width)
222 |         sys.stdout.write('\r')
223 |       else:
224 |         sys.stdout.write('\n')
225 | 
226 |       if self.target is not None:
227 |         numdigits = int(np.log10(self.target)) + 1
228 |         bar = ('%' + str(numdigits) + 'd/%d [') % (current, self.target)
229 |         prog = float(current) / self.target
230 |         prog_width = int(self.width * prog)
231 |         if prog_width > 0:
232 |           bar += ('=' * (prog_width - 1))
233 |           if current < self.target:
234 |             bar += '>'
235 |           else:
236 |             bar += '='
237 |         bar += ('.' * (self.width - prog_width))
238 |         bar += ']'
239 |       else:
240 |         bar = '%7d/Unknown' % current
241 | 
242 |       self._total_width = len(bar)
243 |       sys.stdout.write(bar)
244 | 
245 |       time_per_unit = self._estimate_step_duration(current, now)
246 | 
247 |       if self.target is None or finalize:
248 |         if time_per_unit >= 1 or time_per_unit == 0:
249 |           info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
250 |         elif time_per_unit >= 1e-3:
251 |           info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
252 |         else:
253 |           info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
254 |       else:
255 |         eta = time_per_unit * (self.target - current)
256 |         if eta > 3600:
257 |           eta_format = '%d:%02d:%02d' % (eta // 3600,
258 |                                          (eta % 3600) // 60, eta % 60)
259 |         elif eta > 60:
260 |           eta_format = '%d:%02d' % (eta // 60, eta % 60)
261 |         else:
262 |           eta_format = '%ds' % eta
263 | 
264 |         info = ' - ETA: %s' % eta_format
265 | 
266 |       for k in self._values_order:
267 |         info += ' - %s:' % k
268 |         if isinstance(self._values[k], list):
269 |           avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
270 |           if abs(avg) > 1e-3:
271 |             info += ' %.4f' % avg
272 |           else:
273 |             info += ' %.4e' % avg
274 |         else:
275 |           info += ' %s' % self._values[k]
276 | 
277 |       self._total_width += len(info)
278 |       if prev_total_width > self._total_width:
279 |         info += (' ' * (prev_total_width - self._total_width))
280 | 
281 |       if finalize:
282 |         info += '\n'
283 | 
284 |       sys.stdout.write(info)
285 |       sys.stdout.flush()
286 | 
287 |     elif self.verbose == 2:
288 |       if finalize:
289 |         numdigits = int(np.log10(self.target)) + 1
290 |         count = ('%' + str(numdigits) + 'd/%d') % (current, self.target)
291 |         info = count + info
292 |         for k in self._values_order:
293 |           info += ' - %s:' % k
294 |           avg = np.mean(self._values[k][0] / max(1, self._values[k][1]))
295 |           if avg > 1e-3:
296 |             info += ' %.4f' % avg
297 |           else:
298 |             info += ' %.4e' % avg
299 |         info += '\n'
300 | 
301 |         sys.stdout.write(info)
302 |         sys.stdout.flush()
303 | 
304 |     self._last_update = now
305 | 
306 |   def add(self, n, values=None):
307 |     self.update(self._seen_so_far + n, values)
308 | 
309 |   def _estimate_step_duration(self, current, now):
310 |     """Estimate the duration of a single step.
311 |     Given the step number `current` and the corresponding time `now`
312 |     this function returns an estimate for how long a single step
313 |     takes. If this is called before one step has been completed
314 |     (i.e. `current == 0`) then zero is given as an estimate. The duration
315 |     estimate ignores the duration of the (assumed to be non-representative)
316 |     first step for estimates when more steps are available (i.e. `current>1`).
317 |     Arguments:
318 |       current: Index of current step.
319 |       now: The current time.
320 |     Returns: Estimate of the duration of a single step.
321 |     """
322 |     if current:
323 |       # there are a few special scenarios here:
324 |       # 1) somebody is calling the progress bar without ever supplying step 1
325 |       # 2) somebody is calling the progress bar and supplies step one mulitple
326 |       #    times, e.g. as part of a finalizing call
327 |       # in these cases, we just fall back to the simple calculation
328 |       if self._time_after_first_step is not None and current > 1:
329 |         time_per_unit = (now - self._time_after_first_step) / (current - 1)
330 |       else:
331 |         time_per_unit = (now - self._start) / current
332 | 
333 |       if current == 1:
334 |         self._time_after_first_step = now
335 |       return time_per_unit
336 |     else:
337 |       return 0
338 | 


--------------------------------------------------------------------------------
/cusim/constants.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2021 Jisang Yoon
 2 | # All rights reserved.
 3 | #
 4 | # This source code is licensed under the Apache 2.0 license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member
 8 | 
 9 | EPS = 1e-10
10 | WARP_SIZE = 32
11 | 


--------------------------------------------------------------------------------
/cusim/culda/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cusim.culda.pyculda import CuLDA
7 | 


--------------------------------------------------------------------------------
/cusim/culda/bindings.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | //  All rights reserved.
  3 | //
  4 | //  This source code is licensed under the Apache 2.0 license found in the
  5 | //  LICENSE file in the root directory of this source tree.
  6 | #include <pybind11/pybind11.h>
  7 | #include <pybind11/numpy.h>
  8 | #include <pybind11/stl.h>
  9 | 
 10 | #include <iostream>
 11 | #include "culda/culda.hpp"
 12 | 
 13 | namespace py = pybind11;
 14 | 
 15 | typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 16 | typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
 17 | typedef py::array_t<bool, py::array::c_style | py::array::forcecast> bool_array;
 18 | 
 19 | class CuLDABind {
 20 |  public:
 21 |   CuLDABind() {}
 22 | 
 23 |   bool Init(std::string opt_path) {
 24 |     return obj_.Init(opt_path);
 25 |   }
 26 | 
 27 |   void LoadModel(py::object& alpha, py::object& beta,
 28 |       py::object& grad_alpha, py::object& new_beta) {
 29 |     // check shape of alpha and beta
 30 |     float_array _alpha(alpha);
 31 |     float_array _beta(beta);
 32 |     auto alpha_buffer = _alpha.request();
 33 |     auto beta_buffer = _beta.request();
 34 |     if (alpha_buffer.ndim != 1 or beta_buffer.ndim != 2 or
 35 |         alpha_buffer.shape[0] != beta_buffer.shape[1]) {
 36 |       throw std::runtime_error("invalid alpha or beta");
 37 |     }
 38 | 
 39 |     // check shape of grad alpha and new beta
 40 |     float_array _grad_alpha(grad_alpha);
 41 |     float_array _new_beta(new_beta);
 42 |     auto grad_alpha_buffer = _grad_alpha.request();
 43 |     auto new_beta_buffer = _new_beta.request();
 44 |     if (grad_alpha_buffer.ndim != 2 or
 45 |         new_beta_buffer.ndim != 2 or
 46 |         grad_alpha_buffer.shape[1] != new_beta_buffer.shape[1]) {
 47 |       throw std::runtime_error("invalid grad_alpha or new_beta");
 48 |     }
 49 | 
 50 |     int num_words = beta_buffer.shape[0];
 51 | 
 52 |     return obj_.LoadModel(_alpha.mutable_data(0),
 53 |         _beta.mutable_data(0),
 54 |         _grad_alpha.mutable_data(0),
 55 |         _new_beta.mutable_data(0), num_words);
 56 |   }
 57 | 
 58 |   std::pair<float, float> FeedData(py::object& cols,
 59 |       py::object& indptr, py::object& vali, py::object& counts,
 60 |       py::object& gamma, const bool init_gamma,
 61 |       const int num_iters) {
 62 |     int_array _cols(cols);
 63 |     int_array _indptr(indptr);
 64 |     bool_array _vali(vali);
 65 |     float_array _counts(counts);
 66 |     float_array _gamma(gamma);
 67 |     auto cols_buffer = _cols.request();
 68 |     auto indptr_buffer = _indptr.request();
 69 |     auto vali_buffer = _vali.request();
 70 |     auto counts_buffer = _counts.request();
 71 |     auto gamma_buffer = _gamma.request();
 72 |     if (cols_buffer.ndim != 1 or
 73 |         indptr_buffer.ndim != 1 or
 74 |         vali_buffer.ndim != 1 or
 75 |         counts_buffer.ndim != 1 or
 76 |         gamma_buffer.ndim != 2) {
 77 |       throw std::runtime_error("invalid ndim");
 78 |     }
 79 |     int num_cols = cols_buffer.shape[0];
 80 |     int num_indptr = indptr_buffer.shape[0] - 1;
 81 | 
 82 |     if (vali_buffer.shape[0] != num_cols or
 83 |         counts_buffer.shape[0] != num_cols or
 84 |         gamma_buffer.shape[0] != num_indptr) {
 85 |       throw std::runtime_error("invalid length");
 86 |     }
 87 |     return obj_.FeedData(_cols.data(0), _indptr.data(0),
 88 |         _vali.data(0), _counts.data(0), _gamma.mutable_data(0),
 89 |         init_gamma, num_cols, num_indptr, num_iters);
 90 |   }
 91 | 
 92 |   void Pull() {
 93 |     obj_.Pull();
 94 |   }
 95 | 
 96 |   void Push() {
 97 |     obj_.Push();
 98 |   }
 99 | 
100 |   int GetBlockCnt() {
101 |     return obj_.GetBlockCnt();
102 |   }
103 | 
104 |  private:
105 |   cusim::CuLDA obj_;
106 | };
107 | 
108 | PYBIND11_PLUGIN(culda_bind) {
109 |   py::module m("CuLDABind");
110 | 
111 |   py::class_<CuLDABind>(m, "CuLDABind")
112 |   .def(py::init())
113 |   .def("init", &CuLDABind::Init, py::arg("opt_path"))
114 |   .def("load_model", &CuLDABind::LoadModel,
115 |       py::arg("alpha"), py::arg("beta"),
116 |       py::arg("grad_alpha"), py::arg("new_beta"))
117 |   .def("feed_data", &CuLDABind::FeedData,
118 |       py::arg("cols"), py::arg("indptr"), py::arg("vali"),
119 |       py::arg("counts"), py::arg("gamma"),
120 |       py::arg("init_gamma"), py::arg("num_iters"))
121 |   .def("pull", &CuLDABind::Pull)
122 |   .def("push", &CuLDABind::Push)
123 |   .def("get_block_cnt", &CuLDABind::GetBlockCnt)
124 |   .def("__repr__",
125 |   [](const CuLDABind &a) {
126 |     return "<CuLDABind>";
127 |   }
128 |   );
129 |   return m.ptr();
130 | }
131 | 


--------------------------------------------------------------------------------
/cusim/culda/pyculda.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member
  8 | import os
  9 | from os.path import join as pjoin
 10 | 
 11 | import json
 12 | import atexit
 13 | import shutil
 14 | import tempfile
 15 | 
 16 | import h5py
 17 | import numpy as np
 18 | from scipy.special import polygamma as pg
 19 | 
 20 | from cusim import aux, IoUtils
 21 | from cusim.culda.culda_bind import CuLDABind
 22 | from cusim.config_pb2 import CuLDAConfigProto
 23 | from cusim.constants import EPS, WARP_SIZE
 24 | 
 25 | 
 26 | class CuLDA:
 27 |   def __init__(self, opt=None):
 28 |     self.opt = aux.get_opt_as_proto(opt or {}, CuLDAConfigProto)
 29 |     self.logger = aux.get_logger("culda", level=self.opt.py_log_level)
 30 | 
 31 |     assert self.opt.block_dim <= WARP_SIZE ** 2 and \
 32 |       self.opt.block_dim % WARP_SIZE == 0, \
 33 |       f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"
 34 | 
 35 |     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 36 |     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
 37 |     tmp.write(opt_content)
 38 |     tmp.close()
 39 | 
 40 |     self.logger.info("opt: %s", opt_content)
 41 |     self.obj = CuLDABind()
 42 |     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
 43 |     os.remove(tmp.name)
 44 | 
 45 |     self.words, self.num_words, self.num_docs = None, None, None
 46 |     self.alpha, self.beta, self.grad_alpha, self.new_beta = \
 47 |       None, None, None, None
 48 | 
 49 |     self.tmp_dirs = []
 50 |     atexit.register(self.remove_tmp)
 51 | 
 52 |   def preprocess_data(self):
 53 |     if self.opt.skip_preprocess:
 54 |       return
 55 |     iou = IoUtils(aux.proto_to_dict(self.opt.io))
 56 |     if not self.opt.processed_data_path:
 57 |       data_dir = tempfile.TemporaryDirectory().name
 58 |       self.tmp_dirs.append(data_dir)
 59 |       self.opt.processed_data_path = pjoin(data_dir, "token.h5")
 60 |     iou.convert_bow_to_h5(self.opt.data_path, self.opt.processed_data_path)
 61 | 
 62 |   def init_model(self):
 63 |     # count number of docs and load voca
 64 |     assert os.path.exists(self.opt.processed_data_path)
 65 |     assert os.path.exists(self.opt.keys_path)
 66 |     h5f = h5py.File(self.opt.processed_data_path, "r")
 67 |     self.num_docs = h5f["indptr"].shape[0] - 1
 68 |     h5f.close()
 69 |     with open(self.opt.keys_path, "rb") as fin:
 70 |       self.words = [line.strip().decode("utf8") for line in fin]
 71 |     self.num_words = len(self.words)
 72 | 
 73 |     self.logger.info("number of words: %d, docs: %d",
 74 |                      self.num_words, self.num_docs)
 75 | 
 76 |     # random initialize alpha and beta
 77 |     np.random.seed(self.opt.seed)
 78 |     self.alpha = np.random.uniform( \
 79 |       size=(self.opt.num_topics,)).astype(np.float32)
 80 |     self.beta = np.random.uniform( \
 81 |       size=(self.num_words, self.opt.num_topics)).astype(np.float32)
 82 |     self.beta /= np.sum(self.beta, axis=0)[None, :]
 83 |     self.logger.info("alpha %s, beta %s initialized",
 84 |                      self.alpha.shape, self.beta.shape)
 85 | 
 86 |     # zero initialize grad alpha and new beta
 87 |     block_cnt = self.obj.get_block_cnt()
 88 |     self.grad_alpha = np.zeros(shape=(block_cnt, self.opt.num_topics),
 89 |                                dtype=np.float32)
 90 |     self.new_beta = np.zeros(shape=self.beta.shape, dtype=np.float32)
 91 |     self.logger.info("grad alpha %s, new beta %s initialized",
 92 |                      self.grad_alpha.shape, self.new_beta.shape)
 93 | 
 94 |     # set h5 file path to backup gamma
 95 |     if not self.opt.gamma_path:
 96 |       data_dir = tempfile.TemporaryDirectory().name
 97 |       self.tmp_dirs.append(data_dir)
 98 |       self.opt.gamma_path = pjoin(data_dir, "gamma.h5")
 99 |     self.logger.info("backup gamma to %s", self.opt.gamma_path)
100 |     os.makedirs(os.path.dirname(self.opt.gamma_path), exist_ok=True)
101 |     h5f = h5py.File(self.opt.gamma_path, "w")
102 |     h5f.create_dataset("gamma", shape=(self.num_docs, self.opt.num_topics),
103 |                        dtype=np.float32)
104 |     h5f.close()
105 | 
106 |     # push it to gpu
107 |     self.obj.load_model(self.alpha, self.beta, self.grad_alpha, self.new_beta)
108 | 
109 |   def train_model(self):
110 |     self.preprocess_data()
111 |     self.init_model()
112 |     h5f = h5py.File(self.opt.processed_data_path, "r")
113 |     for epoch in range(1, self.opt.epochs + 1):
114 |       gamma_h5f = h5py.File(self.opt.gamma_path, "r+")
115 |       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
116 |       self._train_e_step(h5f, gamma_h5f["gamma"], epoch)
117 |       self._train_m_step()
118 |       gamma_h5f.close()
119 |     h5f.close()
120 | 
121 |   def _train_e_step(self, h5f, gamma_h5f, epoch):
122 |     offset, size = 0, h5f["cols"].shape[0]
123 |     pbar = aux.Progbar(size, stateful_metrics=["train_loss", "vali_loss"])
124 |     train_loss_nume, train_loss_deno = 0, 0
125 |     vali_loss_nume, vali_loss_deno = 0, 0
126 |     while True:
127 |       target = h5f["indptr"][offset] + self.opt.batch_size
128 |       if target < size:
129 |         next_offset = h5f["rows"][target]
130 |       else:
131 |         next_offset = h5f["indptr"].shape[0] - 1
132 |       indptr = h5f["indptr"][offset:next_offset + 1]
133 |       beg, end = indptr[0], indptr[-1]
134 |       indptr -= beg
135 |       cols = h5f["cols"][beg:end]
136 |       counts = h5f["counts"][beg:end]
137 |       vali = (h5f["vali"][beg:end] < self.opt.vali_p).astype(np.bool)
138 |       gamma = gamma_h5f[offset:next_offset, :]
139 | 
140 |       # call cuda kernel
141 |       train_loss, vali_loss = \
142 |         self.obj.feed_data(cols, indptr.astype(np.int32),
143 |                            vali, counts, gamma,
144 |                            epoch == 1 or self.opt.reuse_gamma,
145 |                            self.opt.num_iters_in_e_step)
146 | 
147 |       gamma_h5f[offset:next_offset, :] = gamma
148 |       # accumulate loss
149 |       train_loss_nume -= train_loss
150 |       vali_loss_nume -= vali_loss
151 |       train_loss_deno += np.sum(counts[~vali])
152 |       vali_loss_deno += np.sum(counts[vali])
153 |       train_loss = train_loss_nume / (train_loss_deno + EPS)
154 |       vali_loss = vali_loss_nume / (vali_loss_deno + EPS)
155 | 
156 |       # update progress bar
157 |       pbar.update(end, values=[("train_loss", train_loss),
158 |                                ("vali_loss", vali_loss)])
159 |       offset = next_offset
160 | 
161 |       if end == size:
162 |         break
163 | 
164 |   def _train_m_step(self):
165 |     self.obj.pull()
166 | 
167 |     # update beta
168 |     self.new_beta[:, :] = np.maximum(self.new_beta, EPS)
169 |     self.beta[:, :] = self.new_beta / np.sum(self.new_beta, axis=0)[None, :]
170 |     self.new_beta[:, :] = 0
171 | 
172 |     # update alpha
173 |     alpha_sum = np.sum(self.alpha)
174 |     gvec = np.sum(self.grad_alpha, axis=0)
175 |     gvec += self.num_docs * (pg(0, alpha_sum) - pg(0, self.alpha))
176 |     hvec = self.num_docs * pg(1, self.alpha)
177 |     z_0 = pg(1, alpha_sum)
178 |     c_nume = np.sum(gvec / hvec)
179 |     c_deno = 1 / z_0 + np.sum(1 / hvec)
180 |     c_0 = c_nume / c_deno
181 |     delta = (gvec - c_0) / hvec
182 |     self.alpha -= delta
183 |     self.alpha[:] = np.maximum(self.alpha, EPS)
184 |     self.grad_alpha[:,:] = 0
185 | 
186 |     self.obj.push()
187 | 
188 |   def save_h5_model(self, filepath, chunk_size=10000):
189 |     self.logger.info("save h5 format model path to %s", filepath)
190 |     os.makedirs(os.path.dirname(filepath), exist_ok=True)
191 |     h5f = h5py.File(filepath, "w")
192 |     h5f.create_dataset("alpha", data=self.alpha)
193 |     h5f.create_dataset("beta", data=self.beta)
194 |     h5f.create_dataset("keys", data=np.array([word.encode("utf")
195 |                                               for word in self.words]))
196 |     gamma = h5f.create_dataset("gamma", dtype=np.float32,
197 |                                shape=(self.num_docs, self.opt.num_topics))
198 |     h5f_gamma = h5py.File(self.opt.gamma_path, "r")
199 |     for offset in range(0, self.num_docs, chunk_size):
200 |       next_offset = min(self.num_docs, offset + chunk_size)
201 |       gamma[offset:next_offset, :] = h5f_gamma["gamma"][offset:next_offset, :]
202 |     h5f_gamma.close()
203 |     h5f.close()
204 | 
205 |   def remove_tmp(self):
206 |     if not self.opt.remove_tmp:
207 |       return
208 |     for tmp_dir in self.tmp_dirs:
209 |       if os.path.exists(tmp_dir):
210 |         self.logger.info("remove %s", tmp_dir)
211 |         shutil.rmtree(tmp_dir)
212 | 


--------------------------------------------------------------------------------
/cusim/cuw2v/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cusim.cuw2v.pycuw2v import CuW2V
7 | 


--------------------------------------------------------------------------------
/cusim/cuw2v/bindings.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | //  All rights reserved.
  3 | //
  4 | //  This source code is licensed under the Apache 2.0 license found in the
  5 | //  LICENSE file in the root directory of this source tree.
  6 | #include <pybind11/pybind11.h>
  7 | #include <pybind11/numpy.h>
  8 | #include <pybind11/stl.h>
  9 | 
 10 | #include <iostream>
 11 | #include "cuw2v/cuw2v.hpp"
 12 | 
 13 | namespace py = pybind11;
 14 | 
 15 | typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 16 | typedef py::array_t<double, py::array::c_style | py::array::forcecast> double_array;
 17 | typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
 18 | 
 19 | class CuW2VBind {
 20 |  public:
 21 |   CuW2VBind() {}
 22 | 
 23 |   bool Init(std::string opt_path) {
 24 |     return obj_.Init(opt_path);
 25 |   }
 26 | 
 27 |   void LoadModel(py::object& emb_in, py::object& emb_out) {
 28 |     // check shape of alpha and beta
 29 |     float_array _emb_in(emb_in);
 30 |     float_array _emb_out(emb_out);
 31 |     auto emb_in_buffer = _emb_in.request();
 32 |     auto emb_out_buffer = _emb_out.request();
 33 |     if (emb_in_buffer.ndim != 2 or emb_out_buffer.ndim != 2 or
 34 |         emb_in_buffer.shape[1] != emb_out_buffer.shape[1]) {
 35 |       throw std::runtime_error("invalid emb_in or emb_out");
 36 |     }
 37 | 
 38 |     return obj_.LoadModel(_emb_in.mutable_data(0), _emb_out.mutable_data(0));
 39 |   }
 40 | 
 41 |   void BuildRandomTable(py::object& word_count, int table_size) {
 42 |     double_array _word_count(word_count);
 43 |     auto wc_buffer = _word_count.request();
 44 |     if (wc_buffer.ndim != 1) {
 45 |       throw std::runtime_error("invalid word count");
 46 |     }
 47 |     int num_words = wc_buffer.shape[0];
 48 |     obj_.BuildRandomTable(_word_count.data(0), num_words, table_size);
 49 |   }
 50 | 
 51 |   void BuildHuffmanTree(py::object& word_count) {
 52 |     float_array _word_count(word_count);
 53 |     auto wc_buffer = _word_count.request();
 54 |     if (wc_buffer.ndim != 1) {
 55 |       throw std::runtime_error("invalid word count");
 56 |     }
 57 |     int num_words = wc_buffer.shape[0];
 58 |     obj_.BuildHuffmanTree(_word_count.data(0), num_words);
 59 |   }
 60 | 
 61 |   std::pair<float, float> FeedData(py::object& cols, py::object& indptr) {
 62 |     int_array _cols(cols);
 63 |     int_array _indptr(indptr);
 64 |     auto cols_buffer = _cols.request();
 65 |     auto indptr_buffer = _indptr.request();
 66 |     if (cols_buffer.ndim != 1 or indptr_buffer.ndim != 1) {
 67 |       throw std::runtime_error("invalid cols or indptr");
 68 |     }
 69 |     int num_cols = cols_buffer.shape[0];
 70 |     int num_indptr = indptr_buffer.shape[0] - 1;
 71 |     return obj_.FeedData(_cols.data(0), _indptr.data(0), num_cols, num_indptr);
 72 |   }
 73 | 
 74 |   void Pull() {
 75 |     obj_.Pull();
 76 |   }
 77 | 
 78 |  private:
 79 |   cusim::CuW2V obj_;
 80 | };
 81 | 
 82 | PYBIND11_PLUGIN(cuw2v_bind) {
 83 |   py::module m("CuW2VBind");
 84 | 
 85 |   py::class_<CuW2VBind>(m, "CuW2VBind")
 86 |   .def(py::init())
 87 |   .def("init", &CuW2VBind::Init, py::arg("opt_path"))
 88 |   .def("load_model", &CuW2VBind::LoadModel,
 89 |       py::arg("emb_in"), py::arg("emb_out"))
 90 |   .def("feed_data", &CuW2VBind::FeedData,
 91 |       py::arg("cols"), py::arg("indptr"))
 92 |   .def("pull", &CuW2VBind::Pull)
 93 |   .def("build_random_table", &CuW2VBind::BuildRandomTable,
 94 |       py::arg("word_count"), py::arg("table_size"))
 95 |   .def("build_huffman_tree", &CuW2VBind::BuildHuffmanTree,
 96 |       py::arg("word_count"))
 97 |   .def("__repr__",
 98 |   [](const CuW2VBind &a) {
 99 |     return "<CuW2VBind>";
100 |   }
101 |   );
102 |   return m.ptr();
103 | }
104 | 


--------------------------------------------------------------------------------
/cusim/cuw2v/pycuw2v.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member
  8 | import os
  9 | from os.path import join as pjoin
 10 | 
 11 | import json
 12 | import atexit
 13 | import shutil
 14 | import tempfile
 15 | 
 16 | import h5py
 17 | import numpy as np
 18 | 
 19 | from cusim import aux, IoUtils
 20 | from cusim.cuw2v.cuw2v_bind import CuW2VBind
 21 | from cusim.config_pb2 import CuW2VConfigProto
 22 | from cusim.constants import EPS, WARP_SIZE
 23 | 
 24 | class CuW2V:
 25 |   def __init__(self, opt=None):
 26 |     self.opt = aux.get_opt_as_proto(opt or {}, CuW2VConfigProto)
 27 |     self.logger = aux.get_logger("culda", level=self.opt.py_log_level)
 28 | 
 29 |     assert self.opt.block_dim <= WARP_SIZE ** 2 and \
 30 |       self.opt.block_dim % WARP_SIZE == 0, \
 31 |       f"invalid block dim ({self.opt.block_dim}, warp size: {WARP_SIZE})"
 32 | 
 33 |     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 34 |     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
 35 |     tmp.write(opt_content)
 36 |     tmp.close()
 37 | 
 38 |     self.logger.info("opt: %s", opt_content)
 39 |     self.obj = CuW2VBind()
 40 |     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
 41 |     os.remove(tmp.name)
 42 | 
 43 |     self.words, self.word_count, self.num_words, self.num_docs = \
 44 |       None, None, None, None
 45 |     self.emb_in, self.emb_out = None, None
 46 |     self.tmp_dirs = []
 47 |     atexit.register(self.remove_tmp)
 48 | 
 49 |   def preprocess_data(self):
 50 |     if self.opt.skip_preprocess:
 51 |       return
 52 |     iou = IoUtils(aux.proto_to_dict(self.opt.io))
 53 |     if not self.opt.processed_data_dir:
 54 |       self.opt.processed_data_dir = tempfile.TemporaryDirectory().name
 55 |       self.tmp_dirs.append(self.opt.processed_data_dir)
 56 |     iou.convert_stream_to_h5(self.opt.data_path, self.opt.word_min_count,
 57 |                              self.opt.processed_data_dir)
 58 | 
 59 |   def init_model(self):
 60 |     # load voca
 61 |     data_dir = self.opt.processed_data_dir
 62 |     keys_path = pjoin(data_dir, "keys.txt")
 63 |     count_path = pjoin(data_dir, "count.txt")
 64 |     self.logger.info("load key, count from %s, %s", keys_path, count_path)
 65 |     with open(keys_path, "rb") as fin:
 66 |       self.words = [line.strip().decode("utf8") for line in fin]
 67 |     with open(count_path, "rb") as fin:
 68 |       self.word_count = np.array([int(line.strip()) for line in fin],
 69 |                                  dtype=np.int64)
 70 |     self.num_words = len(self.words)
 71 |     assert len(self.words) == len(self.word_count)
 72 | 
 73 |     # count number of docs
 74 |     h5f = h5py.File(pjoin(data_dir, "token.h5"), "r")
 75 |     self.num_docs = h5f["indptr"].shape[0] - 1
 76 |     h5f.close()
 77 | 
 78 |     self.logger.info("number of words: %d, docs: %d",
 79 |                      self.num_words, self.num_docs)
 80 | 
 81 |     # normalize word count
 82 |     word_count = np.power(self.word_count, self.opt.count_power,
 83 |                           dtype=np.float64)
 84 |     word_count /= np.sum(word_count)
 85 |     if self.opt.neg:
 86 |       self.obj.build_random_table(word_count, self.opt.random_size)
 87 |     else:
 88 |       self.obj.build_huffman_tree(word_count.astype(np.float32))
 89 | 
 90 |     # random initialize alpha and beta
 91 |     np.random.seed(self.opt.seed)
 92 |     scale = 1 / np.sqrt(self.opt.num_dims)
 93 |     self.emb_in = np.random.normal(loc=0, scale=scale, \
 94 |       size=(self.num_words, self.opt.num_dims)).astype(np.float32)
 95 |     out_words = self.num_words if self.opt.neg else self.num_words - 1
 96 |     self.emb_out = np.random.normal(loc=0, scale=scale, \
 97 |       size=(out_words, self.opt.num_dims)).astype(np.float32)
 98 |     self.logger.info("emb_in %s, emb_out %s initialized",
 99 |                      self.emb_in.shape, self.emb_out.shape)
100 | 
101 |     if self.opt.pretrained_model.filename:
102 |       self.load_word2vec_format(**aux.proto_to_dict(self.opt.pretrained_model))
103 | 
104 |     # push it to gpu
105 |     self.obj.load_model(self.emb_in, self.emb_out)
106 | 
107 |   def train_model(self):
108 |     self.preprocess_data()
109 |     self.init_model()
110 |     h5f = h5py.File(pjoin(self.opt.processed_data_dir, "token.h5"), "r")
111 |     for epoch in range(1, self.opt.epochs + 1):
112 |       self.logger.info("Epoch %d / %d", epoch, self.opt.epochs)
113 |       self._train_epoch(h5f)
114 |     self.obj.pull()
115 |     h5f.close()
116 | 
117 |   def _train_epoch(self, h5f):
118 |     offset, size = 0, h5f["cols"].shape[0]
119 |     pbar = aux.Progbar(size, stateful_metrics=["loss"])
120 |     loss_nume, loss_deno = 0, 0
121 |     while True:
122 |       target = h5f["indptr"][offset] + self.opt.batch_size
123 |       if target < size:
124 |         next_offset = h5f["rows"][target]
125 |       else:
126 |         next_offset = h5f["indptr"].shape[0] - 1
127 |       indptr = h5f["indptr"][offset:next_offset + 1]
128 |       beg, end = indptr[0], indptr[-1]
129 |       indptr -= beg
130 |       cols = h5f["cols"][beg:end]
131 |       offset = next_offset
132 | 
133 |       # call cuda kernel
134 |       _loss_nume, _loss_deno = \
135 |         self.obj.feed_data(cols, indptr.astype(np.int32))
136 | 
137 |       # accumulate loss
138 |       loss_nume += _loss_nume
139 |       loss_deno += _loss_deno
140 |       loss = loss_nume / (loss_deno + EPS)
141 | 
142 |       # update progress bar
143 |       pbar.update(end, values=[("loss", loss)])
144 |       if end == size:
145 |         break
146 | 
147 |   def save_h5_model(self, filename):
148 |     self.logger.info("save h5 format model to %s", filename)
149 |     os.makedirs(os.path.dirname(filename), exist_ok=True)
150 |     h5f = h5py.File(filename, "w")
151 |     h5f.create_dataset("emb_in", data=self.emb_in)
152 |     h5f.create_dataset("emb_out", data=self.emb_out)
153 |     h5f.create_dataset("keys", data=np.array([word.encode("utf")
154 |                                               for word in self.words]))
155 |     h5f.close()
156 | 
157 |   def save_word2vec_format(self, filename, binary=False, prefix=""):
158 |     self.logger.info("save word2vec format model to %s, "
159 |                      "binary: %s, prefix: '%s'", filename, binary, prefix)
160 |     # save model compatible with gensim and original w2v code by Google
161 |     with open(filename, "wb") as fout:
162 |       fout.write(f"{self.num_words} {self.opt.num_dims}\n".encode("utf8"))
163 |       for idx, word in enumerate(self.words):
164 |         vec = self.emb_in[idx]
165 |         if binary:
166 |           fout.write(f"{prefix}{word} ".encode("utf8") + vec.tobytes())
167 |         else:
168 |           fout.write(f"{prefix}{word} "
169 |                      f"{' '.join(repr(val) for val in vec)}\n".encode("utf8"))
170 | 
171 |   def load_word2vec_format(self, filename, binary=False,
172 |                             symmetry=False, no_header=False):
173 |     self.logger.info("load pretrained model from %s", filename)
174 |     # copy pretrained model to emb_out as well only if
175 |     # we use negative sampling, NOT hierarchical softmax
176 |     assert not symmetry or self.opt.neg, "no symmetry in hierarchical softmax"
177 | 
178 |     # read variable
179 |     vector_dict = {}
180 |     with open(filename, "rb") as fin:
181 |       if not no_header:
182 |         fin.readline()  # throw one line
183 |       for line in fin:
184 |         if binary:
185 |           key, vec = line.split()
186 |           vector_dict[key] = np.fromstring(vec, dtype=np.float32)
187 |         else:
188 |           line_vec = line.strip().split()
189 |           key = line_vec[0].decode("utf8")
190 |           vec = np.array([float(val) for val in line_vec[1:]],
191 |                          dtype=np.float32)
192 |           vector_dict[key] = vec
193 | 
194 |     # copy to variable
195 |     loaded_cnt = 0
196 |     word_idmap = {word: idx for idx, word in enumerate(self.words)}
197 |     for key, vec in vector_dict.items():
198 |       assert len(vec) == self.opt.num_dims
199 |       if key not in word_idmap:
200 |         continue
201 |       idx = word_idmap[key]
202 |       loaded_cnt += 1
203 |       self.emb_in[idx, :] = vec
204 |       if symmetry:
205 |         self.emb_out[idx, :] = vec
206 |     self.logger.info("loaded count: %d", loaded_cnt)
207 | 
208 |   def remove_tmp(self):
209 |     if not self.opt.remove_tmp:
210 |       return
211 |     for tmp_dir in self.tmp_dirs:
212 |       if os.path.exists(tmp_dir):
213 |         self.logger.info("remove %s", tmp_dir)
214 |         shutil.rmtree(tmp_dir)
215 | 


--------------------------------------------------------------------------------
/cusim/ioutils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021 Jisang Yoon
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the Apache 2.0 license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | from cusim.ioutils.pyioutils import IoUtils
7 | 


--------------------------------------------------------------------------------
/cusim/ioutils/bindings.cc:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | //  All rights reserved.
  3 | //
  4 | //  This source code is licensed under the Apache 2.0 license found in the
  5 | //  LICENSE file in the root directory of this source tree.
  6 | #include <pybind11/pybind11.h>
  7 | #include <pybind11/numpy.h>
  8 | #include <pybind11/stl.h>
  9 | 
 10 | #include <iostream>
 11 | #include "utils/ioutils.hpp"
 12 | 
 13 | namespace py = pybind11;
 14 | 
 15 | typedef py::array_t<float, py::array::c_style | py::array::forcecast> float_array;
 16 | typedef py::array_t<int, py::array::c_style | py::array::forcecast> int_array;
 17 | typedef py::array_t<int64_t, py::array::c_style | py::array::forcecast> int64_array;
 18 | 
 19 | class IoUtilsBind {
 20 |  public:
 21 |   IoUtilsBind() {}
 22 | 
 23 |   bool Init(std::string opt_path) {
 24 |     return obj_.Init(opt_path);
 25 |   }
 26 | 
 27 |   int64_t LoadStreamFile(std::string filepath) {
 28 |     return obj_.LoadStreamFile(filepath);
 29 |   }
 30 | 
 31 |   std::pair<int, int> ReadStreamForVocab(int num_lines, int num_threads) {
 32 |     return obj_.ReadStreamForVocab(num_lines, num_threads);
 33 |   }
 34 | 
 35 |   std::pair<int, int> TokenizeStream(int num_lines, int num_threads) {
 36 |     return obj_.TokenizeStream(num_lines, num_threads);
 37 |   }
 38 | 
 39 |   void GetWordVocab(int min_count, std::string keys_path, std::string count_path) {
 40 |     obj_.GetWordVocab(min_count, keys_path, count_path);
 41 |   }
 42 | 
 43 |   void GetToken(py::object& rows, py::object& cols, py::object& indptr) {
 44 |     int_array _rows(rows);
 45 |     int_array _cols(cols);
 46 |     int_array _indptr(indptr);
 47 |     obj_.GetToken(_rows.mutable_data(0), _cols.mutable_data(0), _indptr.mutable_data(0));
 48 |   }
 49 | 
 50 |   std::tuple<int64_t, int, int64_t> ReadBagOfWordsHeader(std::string filepath) {
 51 |     return obj_.ReadBagOfWordsHeader(filepath);
 52 |   }
 53 | 
 54 |   void ReadBagOfWordsContent(py::object& rows, py::object& cols,
 55 |       py::object counts) {
 56 |     int64_array _rows(rows);
 57 |     int_array _cols(cols);
 58 |     float_array _counts(counts);
 59 |     auto rows_buffer = _rows.request();
 60 |     auto cols_buffer = _cols.request();
 61 |     auto counts_buffer = _counts.request();
 62 |     int num_lines = rows_buffer.shape[0];
 63 |     if (cols_buffer.shape[0] != num_lines or counts_buffer.shape[0] != num_lines) {
 64 |       throw std::runtime_error("invalid shape");
 65 |     }
 66 |     obj_.ReadBagOfWordsContent(_rows.mutable_data(0),
 67 |         _cols.mutable_data(0), _counts.mutable_data(0), num_lines);
 68 |   }
 69 | 
 70 |  private:
 71 |   cusim::IoUtils obj_;
 72 | };
 73 | 
 74 | PYBIND11_PLUGIN(ioutils_bind) {
 75 |   py::module m("IoUtilsBind");
 76 | 
 77 |   py::class_<IoUtilsBind>(m, "IoUtilsBind")
 78 |   .def(py::init())
 79 |   .def("init", &IoUtilsBind::Init, py::arg("opt_path"))
 80 |   .def("load_stream_file", &IoUtilsBind::LoadStreamFile, py::arg("filepath"))
 81 |   .def("read_stream_for_vocab", &IoUtilsBind::ReadStreamForVocab,
 82 |       py::arg("num_lines"), py::arg("num_threads"))
 83 |   .def("tokenize_stream", &IoUtilsBind::TokenizeStream,
 84 |       py::arg("num_lines"), py::arg("num_threads"))
 85 |   .def("get_word_vocab", &IoUtilsBind::GetWordVocab,
 86 |       py::arg("min_count"), py::arg("keys_path"), py::arg("count_path"))
 87 |   .def("get_token", &IoUtilsBind::GetToken,
 88 |       py::arg("indices"), py::arg("indptr"), py::arg("offset"))
 89 |   .def("read_bag_of_words_header", &IoUtilsBind::ReadBagOfWordsHeader,
 90 |       py::arg("filepath"))
 91 |   .def("read_bag_of_words_content", &IoUtilsBind::ReadBagOfWordsContent,
 92 |       py::arg("rows"), py::arg("cols"), py::arg("counts"))
 93 |   .def("__repr__",
 94 |   [](const IoUtilsBind &a) {
 95 |     return "<IoUtilsBind>";
 96 |   }
 97 |   );
 98 |   return m.ptr();
 99 | }
100 | 


--------------------------------------------------------------------------------
/cusim/ioutils/pyioutils.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,too-few-public-methods,no-member
  8 | import os
  9 | from os.path import join as pjoin
 10 | 
 11 | import json
 12 | import tempfile
 13 | 
 14 | import h5py
 15 | import numpy as np
 16 | 
 17 | from cusim import aux
 18 | from cusim.ioutils.ioutils_bind import IoUtilsBind
 19 | from cusim.config_pb2 import IoUtilsConfigProto
 20 | 
 21 | class IoUtils:
 22 |   def __init__(self, opt=None):
 23 |     self.opt = aux.get_opt_as_proto(opt or {}, IoUtilsConfigProto)
 24 |     self.logger = aux.get_logger("ioutils", level=self.opt.py_log_level)
 25 | 
 26 |     tmp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 27 |     opt_content = json.dumps(aux.proto_to_dict(self.opt), indent=2)
 28 |     tmp.write(opt_content)
 29 |     tmp.close()
 30 | 
 31 |     self.logger.info("opt: %s", opt_content)
 32 |     self.obj = IoUtilsBind()
 33 |     assert self.obj.init(bytes(tmp.name, "utf8")), f"failed to load {tmp.name}"
 34 |     os.remove(tmp.name)
 35 | 
 36 |   def load_stream_vocab(self, filepath, min_count,
 37 |                         keys_path, count_path):
 38 |     full_num_lines = self.obj.load_stream_file(filepath)
 39 |     pbar = aux.Progbar(full_num_lines, unit_name="line",
 40 |                         stateful_metrics=["word_count"])
 41 |     processed = 0
 42 |     while True:
 43 |       read_lines, word_count = \
 44 |         self.obj.read_stream_for_vocab(
 45 |           self.opt.chunk_lines, self.opt.num_threads)
 46 |       processed += read_lines
 47 |       pbar.update(processed, values=[("word_count", word_count)])
 48 |       if processed == full_num_lines:
 49 |         break
 50 |     self.obj.get_word_vocab(min_count, keys_path, count_path)
 51 | 
 52 |   def convert_stream_to_h5(self, filepath, min_count, out_dir,
 53 |                            chunk_indices=10000, seed=777):
 54 |     np.random.seed(seed)
 55 |     os.makedirs(out_dir, exist_ok=True)
 56 |     keys_path = pjoin(out_dir, "keys.txt")
 57 |     count_path = pjoin(out_dir, "count.txt")
 58 |     token_path = pjoin(out_dir, "token.h5")
 59 |     self.logger.info("save key, count, token to %s, %s, %s",
 60 |                      keys_path, count_path, token_path)
 61 |     self.load_stream_vocab(filepath, min_count, keys_path, count_path)
 62 |     full_num_lines = self.obj.load_stream_file(filepath)
 63 |     pbar = aux.Progbar(full_num_lines, unit_name="line")
 64 |     processed = 0
 65 |     h5f = h5py.File(token_path, "w")
 66 |     rows = h5f.create_dataset("rows", shape=(chunk_indices,),
 67 |                               maxshape=(None,), dtype=np.int64,
 68 |                               chunks=(chunk_indices,))
 69 |     cols = h5f.create_dataset("cols", shape=(chunk_indices,),
 70 |                               maxshape=(None,), dtype=np.int32,
 71 |                               chunks=(chunk_indices,))
 72 |     vali = h5f.create_dataset("vali", shape=(chunk_indices,),
 73 |                               maxshape=(None,), dtype=np.float32,
 74 |                               chunks=(chunk_indices,))
 75 |     indptr =  h5f.create_dataset("indptr", shape=(full_num_lines + 1,),
 76 |                                  dtype=np.int64, chunks=True)
 77 |     processed, offset = 1, 0
 78 |     indptr[0] = 0
 79 |     while True:
 80 |       read_lines, data_size = self.obj.tokenize_stream(
 81 |         self.opt.chunk_lines, self.opt.num_threads)
 82 |       _rows = np.empty(shape=(data_size,), dtype=np.int32)
 83 |       _cols = np.empty(shape=(data_size,), dtype=np.int32)
 84 |       _indptr = np.empty(shape=(read_lines,), dtype=np.int32)
 85 |       self.obj.get_token(_rows, _cols, _indptr)
 86 |       rows.resize((offset + data_size,))
 87 |       rows[offset:offset + data_size] = \
 88 |         _rows.astype(np.int64) + (processed - 1)
 89 |       cols.resize((offset + data_size,))
 90 |       cols[offset:offset + data_size] = _cols
 91 |       vali.resize((offset + data_size,))
 92 |       vali[offset:offset + data_size] = \
 93 |         np.random.uniform(size=(data_size,)).astype(np.float32)
 94 |       indptr[processed:processed + read_lines] = \
 95 |         _indptr.astype(np.int64) + offset
 96 |       offset += data_size
 97 |       processed += read_lines
 98 |       pbar.update(processed - 1)
 99 |       if processed == full_num_lines + 1:
100 |         break
101 |     h5f.close()
102 | 
103 |   def convert_bow_to_h5(self, filepath, h5_path):
104 |     self.logger.info("convert bow %s to h5 %s", filepath, h5_path)
105 |     num_docs, num_words, num_lines = \
106 |       self.obj.read_bag_of_words_header(filepath)
107 |     self.logger.info("number of docs: %d, words: %d, nnz: %d",
108 |                      num_docs, num_words, num_lines)
109 |     h5f = h5py.File(h5_path, "w")
110 |     rows = h5f.create_dataset("rows", dtype=np.int64,
111 |                               shape=(num_lines,), chunks=True)
112 |     cols = h5f.create_dataset("cols", dtype=np.int32,
113 |                               shape=(num_lines,), chunks=True)
114 |     counts = h5f.create_dataset("counts", dtype=np.float32,
115 |                                 shape=(num_lines,), chunks=True)
116 |     vali = h5f.create_dataset("vali", dtype=np.float32,
117 |                               shape=(num_lines,), chunks=True)
118 |     indptr = h5f.create_dataset("indptr", dtype=np.int64,
119 |                                 shape=(num_docs + 1,), chunks=True)
120 |     indptr[0] = 0
121 |     processed, recent_row, indptr_offset = 0, 0, 1
122 |     pbar = aux.Progbar(num_lines, unit_name="line")
123 |     while processed < num_lines:
124 |       # get chunk size
125 |       read_lines = min(num_lines - processed, self.opt.chunk_lines)
126 | 
127 |       # copy rows, cols, counts to h5
128 |       _rows = np.empty((read_lines,), dtype=np.int64)
129 |       _cols = np.empty((read_lines,), dtype=np.int32)
130 |       _counts = np.empty((read_lines,), dtype=np.float32)
131 |       self.obj.read_bag_of_words_content(_rows, _cols, _counts)
132 |       rows[processed:processed + read_lines] = _rows
133 |       cols[processed:processed + read_lines] = _cols
134 |       counts[processed:processed + read_lines] = _counts
135 |       vali[processed:processed + read_lines] = \
136 |         np.random.uniform(size=(read_lines,)).astype(np.float32)
137 | 
138 |       # compute indptr
139 |       prev_rows = np.zeros((read_lines,), dtype=np.int64)
140 |       prev_rows[1:] = _rows[:-1]
141 |       prev_rows[0] = recent_row
142 |       diff = _rows - prev_rows
143 |       indices = np.where(diff > 0)[0]
144 |       _indptr = []
145 |       for idx in indices:
146 |         _indptr += ([processed + idx] * diff[idx])
147 |       if _indptr:
148 |         indptr[indptr_offset:indptr_offset + len(_indptr)] = \
149 |           np.array(_indptr, dtype=np.int64)
150 |         indptr_offset += len(_indptr)
151 | 
152 |       # udpate processed
153 |       processed += read_lines
154 |       pbar.update(processed)
155 |       recent_row = _rows[-1]
156 | 
157 |     # finalize indptr
158 |     _indptr = [num_lines] * (num_docs + 1 - indptr_offset)
159 |     indptr[indptr_offset:num_docs + 1] = np.array(_indptr, dtype=np.int64)
160 | 
161 |     h5f.close()
162 | 


--------------------------------------------------------------------------------
/cusim/proto/config.proto:
--------------------------------------------------------------------------------
  1 | // Copyright (c) 2021 Jisang Yoon
  2 | // All rights reserved.
  3 | //
  4 | // This source code is licensed under the Apache 2.0 license found in the
  5 | // LICENSE file in the root directory of this source tree.
  6 | 
  7 | syntax = "proto2";
  8 | 
  9 | 
 10 | // option for data preprocessing
 11 | message IoUtilsConfigProto {
 12 |   // logging levels in python and C++
 13 |   optional int32 py_log_level = 1 [default = 2];
 14 |   optional int32 c_log_level = 2 [default = 2];
 15 | 
 16 |   // number of chunk lines to preprocess (txt => hdf5 format) data
 17 |   optional int32 chunk_lines = 3 [default = 100000];
 18 | 
 19 |   // number of concurrent threads in data preprocessing
 20 |   optional int32 num_threads = 4 [default = 4];
 21 | 
 22 |   // convert charater to lower case if true
 23 |   optional bool lower = 5 [default = true];
 24 | }
 25 | 
 26 | 
 27 | // option for LDA model
 28 | message CuLDAConfigProto {
 29 |   // logging levels in python and C++
 30 |   optional int32 py_log_level = 1 [default = 2];
 31 |   optional int32 c_log_level = 2 [default = 2];
 32 | 
 33 |   // raw data path (format from https://archive.ics.uci.edu/ml/datasets/bag+of+words)
 34 |   optional string data_path = 7;
 35 | 
 36 |   // preprocessed data path (hdf5 format)
 37 |   // if empty, make temporary directory
 38 |   optional string processed_data_path = 6;
 39 | 
 40 |   // vocabulary path
 41 |   required string keys_path = 16;
 42 | 
 43 |   // skip preprocess (there should be already preprocessed hdf5 format) if true
 44 |   optional bool skip_preprocess = 8;
 45 | 
 46 |   // path to store gamma in E step
 47 |   // if empty, make temporary directory
 48 |   optional string gamma_path = 17;
 49 | 
 50 |   // reuse gamma from previous epoch if true
 51 |   // if false, initiate gamma as Figure 6 in https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
 52 |   optional bool reuse_gamma = 18;
 53 | 
 54 |   // number of topics
 55 |   optional int32 num_topics = 3 [default = 10];
 56 | 
 57 |   // block dimension in CUDA
 58 |   // should be multiple of WARP_SIZE (=32)
 59 |   optional int32 block_dim = 4 [default = 32];
 60 | 
 61 |   // set the number blocks as num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
 62 |   optional int32 hyper_threads = 5 [default = 100];
 63 | 
 64 |   // batch size in training
 65 |   optional int32 batch_size = 10 [default = 1000000];
 66 | 
 67 |   // number of epochs in training
 68 |   optional int32 epochs = 11 [default = 10];
 69 | 
 70 |   // number of iterations in each E step
 71 |   optional int32 num_iters_in_e_step = 12 [default = 5];
 72 | 
 73 |   // validation ratio, should be between 0 and 1
 74 |   optional double vali_p = 13 [default = 0.2];
 75 | 
 76 |   // random seed
 77 |   optional int32 seed = 14 [default = 777];
 78 | 
 79 |   // remove all tempory directorys generated by package when program finnished if true
 80 |   optional bool remove_tmp = 19 [default = true];
 81 | 
 82 |   optional IoUtilsConfigProto io = 15;
 83 | }
 84 | 
 85 | // options for loading pretrained w2v model
 86 | // can load w2v model file generated by gensim or original w2v code by Google
 87 | message W2VPretrainedModel {
 88 |   optional string filename = 1;
 89 |   optional bool no_header = 2;
 90 |   optional bool binary = 3;
 91 |   optional bool symmetry = 4;
 92 | }
 93 | 
 94 | 
 95 | // option for training Word2Vec model
 96 | message CuW2VConfigProto {
 97 |   // logging levels in python and C++
 98 |   optional int32 py_log_level = 1 [default = 2];
 99 |   optional int32 c_log_level = 2 [default = 2];
100 | 
101 |   // raw data path (stream txt format)
102 |   optional string data_path = 7;
103 | 
104 |   // path to save preprocessed data (hdf5 format)
105 |   optional string processed_data_dir = 6;
106 | 
107 |   // skip data preprocessing (therefore, there should be
108 |   // already preprocessed hdf5 format file) if true
109 |   optional bool skip_preprocess = 8;
110 | 
111 |   // number of embedding dimensions
112 |   optional int32 num_dims = 3 [default = 50];
113 | 
114 |   // block_dim in CUDA
115 |   optional int32 block_dim = 4 [default = 32];
116 | 
117 |   // set number of blocks as num_blocks * block_dim = physical_cores_in_GPU * hyper_threads
118 |   optional int32 hyper_threads = 5 [default = 100];
119 | 
120 |   // generate vocabulary with words appreared in corpus at least word_min_count times
121 |   optional int32 word_min_count = 9 [default = 5];
122 | 
123 |   // batch size and number of epochs in training
124 |   optional int32 batch_size = 10 [default = 1000000];
125 |   optional int32 epochs = 11 [default = 10];
126 | 
127 |   // seed fields
128 |   optional int32 seed = 14 [default = 777];
129 | 
130 |   // random table size in negative sampling
131 |   optional int32 random_size = 12 [default = 100000000];
132 | 
133 |   // number of negative samples
134 |   // if zero, it uses hierarchical softmax
135 |   optional int32 neg = 17 [default = 10];
136 | 
137 |   // weight in negative sampling will be word_count ** count_power for each word
138 |   // default value 0.75 is recommended in w2v paper
139 |   optional double count_power = 18 [default = 0.75];
140 | 
141 |   // if true, train skip gram model, else train cbow model
142 |   optional bool skip_gram = 19 [default = true];
143 | 
144 |   // if true, use average context vector in cbow model
145 |   // else use summation of context vectors
146 |   optional bool cbow_mean = 20 [default = true];
147 | 
148 |   // learning rate
149 |   optional double lr = 21 [default = 0.001];
150 | 
151 |   // window size in both skip gram and cbow model
152 |   optional int32 window_size = 22 [default = 5];
153 | 
154 |   // remove all tempory directorys generated by package when program finnished if true
155 |   optional bool remove_tmp = 26 [default = true];
156 | 
157 |   optional IoUtilsConfigProto io = 24;
158 |   optional W2VPretrainedModel pretrained_model = 25;
159 | }
160 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = python -msphinx
 7 | SPHINXPROJ    = cusim
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | # pylint: disable=invalid-name,unused-import,redefined-builtin
 2 | 
 3 | # Configuration file for the Sphinx documentation builder.
 4 | #
 5 | # This file only contains a selection of the most common options. For a full
 6 | # list see the documentation:
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 8 | 
 9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | # import os
16 | # import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | import sphinx_rtd_theme
22 | 
23 | project = 'cusim'
24 | copyright = '2021, Jisang Yoon'
25 | author = 'Jisang Yoon'
26 | 
27 | # The full version, including alpha/beta/rc tags
28 | release = "0.0.1"
29 | 
30 | 
31 | # -- General configuration ---------------------------------------------------
32 | 
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 |   "sphinx_rtd_theme",
38 |   "sphinx.ext.autodoc",
39 |   "sphinx.ext.napoleon"
40 | ]
41 | 
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ['_templates']
44 | 
45 | # List of patterns, relative to source directory, that match files and
46 | # directories to ignore when looking for source files.
47 | # This patterns also effect to html_static_path and html_extra_path
48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
49 | 
50 | # The name of the Pygments (syntax highlighting) style to use.
51 | pygments_style = "sphinx"
52 | 
53 | # If true, `todo` and `todoList` produce output, else they produce nothing.
54 | todo_include_todos = False
55 | 
56 | # -- Options for HTML output -------------------------------------------------
57 | 
58 | # The theme to use for HTML and HTML Help pages.  See the documentation for
59 | # a list of builtin themes.
60 | #
61 | html_theme = "sphinx_rtd_theme"
62 | 
63 | # Add any paths that contain custom static files (such as style sheets) here,
64 | # relative to this directory. They are copied after the builtin static files,
65 | # so a file named "default.css" will overwrite the builtin "default.css".
66 | html_static_path = ['_static']
67 | 
68 | html_sidebars = {
69 |     "**": [
70 |         "about.html",
71 |         "navigation.html",
72 |         "relations.html",  # needs 'show_related': True theme option to display
73 |         "searchbox.html",
74 |         "donate.html",
75 |     ]
76 | }
77 | 


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | .. cusim documentation master file, created by
 2 |    sphinx-quickstart on Sat Feb 20 13:36:31 2021.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | CUSIM - Superfast implementation of Word2Vec and LDA
 7 | ====================================================
 8 | 
 9 | 
10 | CUSIM is a project to speed up various ML models (e.g. topic modeling, word embedding, etc) by CUDA. It would be nice to think of it as `gensim <https://github.com/RaRe-Technologies/gensim>`_'s GPU version project. As a starting step, I implemented the most widely used word embedding model, the `word2vec <https://arxiv.org/pdf/1301.3781.pdf>`_ model, and the most representative topic model, the `LDA (Latent Dirichlet Allocation) <https://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf>`_ model.
11 | 
12 | 
13 | 
14 | .. toctree::
15 |    :maxdepth: 2
16 |    :caption: Contents
17 | 
18 |    Installation <install>
19 |    Word2Vec <w2v>
20 |    LDA <lda>
21 | 
22 | 
23 | Indices and tables
24 | ==================
25 | 
26 | * :ref:`genindex`
27 | * :ref:`modindex`
28 | * :ref:`search`
29 | 


--------------------------------------------------------------------------------
/docs/install.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ============
 3 | 
 4 | Install from pypi
 5 | -----------------
 6 | 
 7 | .. code-block:: shell
 8 | 
 9 |   pip install cusim
10 | 
11 | 
12 | Install from source
13 | --------------------
14 | 
15 | .. code-block:: shell
16 | 
17 |   # clone repo and submodules
18 |   git clone git@github.com:js1010/cusim.git && cd cusim && git submodule update --init
19 | 
20 |   # install requirements
21 |   pip install -r requirements.txt
22 | 
23 |   # generate proto
24 |   python -m grpc_tools.protoc --python_out cusim/ --proto_path cusim/proto/ config.proto
25 | 
26 |   # install
27 |   python setup.py install
28 | 


--------------------------------------------------------------------------------
/docs/lda.rst:
--------------------------------------------------------------------------------
  1 | LDA
  2 | ===
  3 | 
  4 | 
  5 | Parameters
  6 | ----------
  7 | 
  8 | 
  9 | - See `CuLDAConfigProto <https://github.com/js1010/cusim/blob/f12d18a65fc603b99350705b235d374654c87517/cusim/proto/config.proto#L27-L83>`_ 
 10 | 
 11 | 
 12 | Example Codes
 13 | -------------
 14 | 
 15 | - Full source code is in `examples/example_lda.py <https://github.com/js1010/cusim/blob/main/examples/example_lda.py>`_
 16 | 
 17 | - Before running example codes, run 
 18 | 
 19 | .. code-block:: shell
 20 | 
 21 |   pip install -r examples/requirements.txt
 22 | 
 23 | 
 24 | - Download and preprocess data
 25 | 
 26 | .. code-block:: python
 27 |   
 28 |   import os
 29 |   from os.path import join as pjoin
 30 |   import subprocess
 31 |   
 32 |   import wget
 33 | 
 34 |   DATASET = "nytimes"
 35 |   DIR_PATH = "./res"
 36 |   BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \
 37 |             "bag-of-words/"
 38 | 
 39 | 
 40 |   # download docword
 41 |   filename = f"docword.{DATASET}.txt.gz"
 42 |   out_path = pjoin(DIR_PATH, filename)
 43 |   wget.download(BASE_URL + filename, out=out_path)
 44 |   print()
 45 | 
 46 |   # decompress
 47 |   cmd = ["gunzip", "-c", out_path, ">",
 48 |          pjoin(DIR_PATH, f"docword.{DATASET}.txt")]
 49 |   cmd = " ".join(cmd)
 50 |   subprocess.call(cmd, shell=True)
 51 |   os.remove(pjoin(DIR_PATH, filename))
 52 | 
 53 |   # download vocab
 54 |   filename = f"vocab.{DATASET}.txt"
 55 |   out_path = pjoin(DIR_PATH, filename)
 56 |   wget.download(BASE_URL + filename, out=out_path)
 57 |   print()
 58 | 
 59 | - Train cusim word2vec
 60 | 
 61 | .. code-block:: python
 62 |   
 63 |   from cusim import CuLDA
 64 | 
 65 |   data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
 66 |   keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
 67 |   processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5")
 68 |   opt = {
 69 |     "data_path": data_path,
 70 |     "processed_data_path": processed_data_path,
 71 |     "keys_path": keys_path,
 72 |     "num_topics": 50,
 73 |     "num_iters_in_e_step": 10,
 74 |     "reuse_gamma": True,
 75 |     # "skip_preprocess": os.path.exists(processed_data_path),
 76 |   }
 77 |   start = time.time()
 78 |   lda = CuLDA(opt)
 79 |   lda.train_model()
 80 | 
 81 | 
 82 | - Save and evaluate model
 83 | 
 84 | .. code-block:: python
 85 |   
 86 |   h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5")
 87 |   lda.save_h5_model(h5_model_path)
 88 | 
 89 |   h5f = h5py.File(h5_model_path, "r")
 90 |   beta = h5f["beta"][:, :].T
 91 |   keys = h5f["keys"][:]
 92 |   topk = 10
 93 |   
 94 |   for idx in range(beta.shape[0]):
 95 |     print("=" * 50)
 96 |     print(f"topic {idx + 1}")
 97 |     print("-" * 50)
 98 |     _beta = beta[idx, :]
 99 |     indices = np.argsort(-_beta)[:topk]
100 |     for rank, wordid in enumerate(indices):
101 |       word = keys[wordid].decode("utf8")
102 |       prob = _beta[wordid]
103 |       print(f"rank {rank + 1}. {word}: {prob}")
104 | 
105 | 
106 | Performance
107 | -----------
108 | 
109 | - Data: `nytimes dataset <https://archive.ics.uci.edu/ml/datasets/bag+of+words>`_
110 | - Topic Results
111 |     - `cusim lda results <https://github.com/js1010/cusim/blob/main/examples/cusim.topics.txt>`_
112 |     - `gensim lda results <https://github.com/js1010/cusim/blob/main/examples/gensim.topics.txt>`_
113 | - Time Performance
114 |     - Experimented in `AWS g4dn 2xlarge <https://aws.amazon.com/ec2/instance-types/g4/>`_ (One NVIDIA T4 and 8 vcpus of 8 Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz)
115 | 
116 | +---------------------+-------------------+--------------------+
117 | | attr                |   gensim (8 vpus) |   cusim (NVIDIA T4)|
118 | +=====================+===================+====================+
119 | | training time (sec) |           447.376 | **76.6972**        |
120 | +---------------------+-------------------+--------------------+
121 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=python -msphinx
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | set SPHINXPROJ=cusim
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The Sphinx module was not found. Make sure you have Sphinx installed,
20 | 	echo.then set the SPHINXBUILD environment variable to point to the full
21 | 	echo.path of the 'sphinx-build' executable. Alternatively you may add the
22 | 	echo.Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/w2v.rst:
--------------------------------------------------------------------------------
  1 | Word2Vec
  2 | ========
  3 | 
  4 | 
  5 | Parameters
  6 | ----------
  7 | 
  8 | 
  9 | - See `CuW2VConfigProto <https://github.com/js1010/cusim/blob/f12d18a65fc603b99350705b235d374654c87517/cusim/proto/config.proto#L95-L159>`_ 
 10 | 
 11 | 
 12 | Example Codes
 13 | -------------
 14 | 
 15 | - Full source code is in `examples/example_w2v.py <https://github.com/js1010/cusim/blob/main/examples/example_w2v.py>`_
 16 | 
 17 | - Before running example codes, run 
 18 | 
 19 | .. code-block:: shell
 20 | 
 21 |   pip install -r examples/requirements.txt
 22 | 
 23 | 
 24 | - Download and preprocess data
 25 | 
 26 | .. code-block:: python
 27 |   
 28 |   import os
 29 |   import subprocess
 30 | 
 31 |   import nltk
 32 |   from nltk.tokenize import RegexpTokenizer
 33 |   
 34 |   DOWNLOAD_PATH = "./res"
 35 |   DATASET = "quora-duplicate-questions"
 36 |   DATA_PATH = f"./res/{DATASET}.stream.txt"
 37 |   PROCESSED_DATA_DIR = f"./res/{DATASET}-processed"
 38 | 
 39 |   def preprocess_line(line, tokenizer, lemmatizer):
 40 |     line = line.lower()
 41 |     line = tokenizer.tokenize(line)
 42 |     line = [token for token in line
 43 |             if not token.isnumeric() and len(token) > 1]
 44 |     line = [lemmatizer.lemmatize(token) for token in line]
 45 |     return " ".join(line)
 46 |   
 47 |   # download
 48 |   api.BASE_DIR = DOWNLOAD_PATH
 49 |   filepath = api.load(DATASET, return_path=True)
 50 |   cmd = ["gunzip", "-c", filepath, ">", DATA_PATH]
 51 |   cmd = " ".join(cmd)
 52 |   subprocess.call(cmd, shell=True)
 53 |     
 54 |   # preprocess data
 55 |   tokenizer = RegexpTokenizer(r'\w+')
 56 |   nltk.download("wordnet")
 57 |   lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
 58 |   fout = open(DATA_PATH + ".tmp", "wb")
 59 |   with open(DATA_PATH, "rb") as fin:
 60 |     for line in tqdm.tqdm(fin):
 61 |       line = line.decode("utf8").strip()
 62 |       line = preprocess_line(line, tokenizer, lemmatizer)
 63 |       fout.write((line + "\n").encode("utf8"))
 64 |   fout.close()
 65 |   os.rename(DATA_PATH + ".tmp", DATA_PATH)
 66 | 
 67 | - Train cusim word2vec
 68 | 
 69 | .. code-block:: python
 70 |   
 71 |   from cusim import CuW2V
 72 | 
 73 |   MIN_COUNT = 5
 74 |   LEARNING_RATE = 0.001
 75 |   NEG_SIZE = 10
 76 |   NUM_DIMS = 100
 77 |   CBOW_MEAN = False
 78 |   EPOCHS = 10
 79 |   
 80 |   opt = {
 81 |     "data_path": DATA_PATH,
 82 |     "processed_data_dir": PROCESSED_DATA_DIR,
 83 |     # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR),
 84 |     "num_dims": NUM_DIMS,
 85 |     "epochs": EPOCHS,
 86 |     "word_min_count": MIN_COUNT,
 87 |     "lr": 0.001,
 88 |     "io": {
 89 |       "lower": False
 90 |     },
 91 |     "neg": 0 if hierarchical_softmax else NEG_SIZE,
 92 |     "skip_gram": skip_gram,
 93 |     "cbow_mean": CBOW_MEAN,
 94 |   }
 95 |   w2v = CuW2V(opt)
 96 |   w2v.train_model()
 97 | 
 98 | 
 99 | - Save and evaluate model
100 | 
101 | .. code-block:: python
102 |   
103 |   import gensim
104 |   from gensim.test.utils import datapath
105 | 
106 |   CUSIM_MODEL = "./res/cusim.w2v.model" 
107 |   
108 |   w2v.save_word2vec_format(CUSIM_MODEL, binary=False)
109 |   model = gensim.models.KeyedVectors.load_word2vec_format(model)
110 |   results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"),
111 |                                          case_insensitive=False)
112 | 
113 | Performance
114 | -----------
115 | 
116 | - Data: quora-duplicate-questions from `gensim downloader api <https://radimrehurek.com/gensim/downloader.html#module-gensim.downloader>`_
117 | - skip gram, hierarchical softmax
118 | - Experimented in `AWS g4dn 2xlarge <https://aws.amazon.com/ec2/instance-types/g4/>`_ (One NVIDIA T4 and 8 vcpus of 8 Intel(R) Xeon(R) Platinum 8259CL CPU @ 2.50GHz)
119 | 
120 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
121 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
122 | +=====================+======================+======================+======================+======================+=====================+
123 | | training time (sec) |           892.596    |           544.212    |           310.727    |           226.472    |       **16.162**    |
124 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
125 | | pearson             |             0.487832 |             0.487696 |             0.482821 |             0.487136 |       **0.492101**  |
126 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
127 | | spearman            |             0.500846 |             0.506214 |             0.501048 |         **0.506718** |            0.479468 |
128 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
129 | 
130 | - skip gram, negative sampling
131 | 
132 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
133 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
134 | +=====================+======================+======================+======================+======================+=====================+
135 | | training time (sec) |           586.545    |           340.489    |           220.804    |           146.23     |       **33.9173**   |
136 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
137 | | pearson             |             0.354448 |             0.353952 |             0.352398 |             0.352925 |        **0.360436** |
138 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
139 | | spearman            |             0.369146 |             0.369365 |         **0.370565** |             0.365822 |        0.355204     |
140 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
141 | 
142 | - CBOW, hierarchical softmax
143 | 
144 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
145 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
146 | +=====================+======================+======================+======================+======================+=====================+
147 | | training time (sec) |           250.135    |           155.121    |           103.57     |            73.8073   |        **6.20787**  |
148 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
149 | | pearson             |             0.309651 |             0.321803 |             0.324854 |             0.314255 |        **0.480298** |
150 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
151 | | spearman            |             0.294047 |             0.308723 |             0.318293 |             0.300591 |        **0.480971** |
152 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
153 | 
154 | - CBOW, negative sampling
155 | 
156 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
157 | | attr                |   1 workers (gensim) |   2 workers (gensim) |   4 workers (gensim) |   8 workers (gensim) |   NVIDIA T4 (cusim) |
158 | +=====================+======================+======================+======================+======================+=====================+
159 | | training time (sec) |           176.923    |           100.369    |            69.7829   |            49.9274   |        **9.90391**  |
160 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
161 | | pearson             |             0.18772  |             0.193152 |             0.204509 |             0.187924 |        **0.368202** |
162 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
163 | | spearman            |             0.243975 |             0.24587  |             0.260531 |             0.237441 |        **0.358042** |
164 | +---------------------+----------------------+----------------------+----------------------+----------------------+---------------------+
165 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | ### How to run example code
 2 | 
 3 | 0. install requirements
 4 | 
 5 | ```shell
 6 | pip install -r requirements.txt
 7 | ```
 8 | 
 9 | 1. first, it is good to know about python-fire in https://github.com/google/python-fire, if you haven't heard yet.
10 | 
11 | 2. run w2v experiments on various setting (e.g. skip gram with hierarchical softmax)
12 | 
13 | ```shell
14 | python example_w2v.py run_experiments --sg0=True --hs0=True
15 | ```
16 | 
17 | 3. run lda experiments 
18 | 
19 | ```shell
20 | python example_lda.py run_experiments
21 | ```
22 | 


--------------------------------------------------------------------------------
/examples/cusim.topics.txt:
--------------------------------------------------------------------------------
  1 | ==================================================
  2 | topic 1
  3 | --------------------------------------------------
  4 | rank 1. car: 0.02677285298705101
  5 | rank 2. vehicle: 0.006062767934054136
  6 | rank 3. wheel: 0.005854051560163498
  7 | rank 4. door: 0.0056894212029874325
  8 | rank 5. vehicles: 0.005506897810846567
  9 | rank 6. model: 0.005505426321178675
 10 | rank 7. seat: 0.00544615276157856
 11 | rank 8. zzz_ford: 0.004928195849061012
 12 | rank 9. truck: 0.00481862248852849
 13 | rank 10. front: 0.004714458715170622
 14 | ==================================================
 15 | topic 2
 16 | --------------------------------------------------
 17 | rank 1. priest: 0.020068475976586342
 18 | rank 2. church: 0.018575558438897133
 19 | rank 3. abuse: 0.014300045557320118
 20 | rank 4. sexual: 0.012478752993047237
 21 | rank 5. information: 0.011768681928515434
 22 | rank 6. bishop: 0.010295535437762737
 23 | rank 7. privacy: 0.00979470182210207
 24 | rank 8. enditalic: 0.007282644044607878
 25 | rank 9. zzz_government: 0.007169242016971111
 26 | rank 10. beginitalic: 0.007022677455097437
 27 | ==================================================
 28 | topic 3
 29 | --------------------------------------------------
 30 | rank 1. scientist: 0.012924057431519032
 31 | rank 2. plant: 0.010201558470726013
 32 | rank 3. animal: 0.00955168902873993
 33 | rank 4. human: 0.006574922241270542
 34 | rank 5. water: 0.006187545135617256
 35 | rank 6. species: 0.005247119814157486
 36 | rank 7. science: 0.003869544016197324
 37 | rank 8. research: 0.0037548812106251717
 38 | rank 9. chemical: 0.0036675187293440104
 39 | rank 10. researcher: 0.003376629902049899
 40 | ==================================================
 41 | topic 4
 42 | --------------------------------------------------
 43 | rank 1. room: 0.0097695617005229
 44 | rank 2. building: 0.009012440219521523
 45 | rank 3. hotel: 0.007701032795011997
 46 | rank 4. town: 0.007017012685537338
 47 | rank 5. visitor: 0.005999790038913488
 48 | rank 6. park: 0.004900350235402584
 49 | rank 7. water: 0.00483303889632225
 50 | rank 8. restaurant: 0.004804808646440506
 51 | rank 9. tour: 0.004689469002187252
 52 | rank 10. house: 0.004657984711229801
 53 | ==================================================
 54 | topic 5
 55 | --------------------------------------------------
 56 | rank 1. executive: 0.010459953919053078
 57 | rank 2. president: 0.009247427806258202
 58 | rank 3. chief: 0.007263457868248224
 59 | rank 4. deal: 0.006793353706598282
 60 | rank 5. media: 0.006781542673707008
 61 | rank 6. zzz_u_s: 0.006448546424508095
 62 | rank 7. question: 0.006310692522674799
 63 | rank 8. public: 0.0058334809727966785
 64 | rank 9. client: 0.0057419463992118835
 65 | rank 10. com: 0.005699603818356991
 66 | ==================================================
 67 | topic 6
 68 | --------------------------------------------------
 69 | rank 1. official: 0.010707475244998932
 70 | rank 2. zzz_new_york: 0.010631673038005829
 71 | rank 3. building: 0.006501882337033749
 72 | rank 4. found: 0.005982889328151941
 73 | rank 5. worker: 0.005903987213969231
 74 | rank 6. officer: 0.00562720000743866
 75 | rank 7. hour: 0.005579715128988028
 76 | rank 8. security: 0.0047438195906579494
 77 | rank 9. plane: 0.004530096426606178
 78 | rank 10. attack: 0.0045171682722866535
 79 | ==================================================
 80 | topic 7
 81 | --------------------------------------------------
 82 | rank 1. gold: 0.008730198256671429
 83 | rank 2. hour: 0.00799502432346344
 84 | rank 3. floor: 0.007750170771032572
 85 | rank 4. medal: 0.005479565821588039
 86 | rank 5. rider: 0.005427593365311623
 87 | rank 6. ice: 0.005187307950109243
 88 | rank 7. event: 0.004164813086390495
 89 | rank 8. silver: 0.00394535344094038
 90 | rank 9. hand: 0.003944935742765665
 91 | rank 10. moment: 0.003745123278349638
 92 | ==================================================
 93 | topic 8
 94 | --------------------------------------------------
 95 | rank 1. customer: 0.020594391971826553
 96 | rank 2. product: 0.01662433333694935
 97 | rank 3. weather: 0.010188293643295765
 98 | rank 4. stores: 0.009588934481143951
 99 | rank 5. marketing: 0.007573566399514675
100 | rank 6. consumer: 0.007247460074722767
101 | rank 7. need: 0.00708211213350296
102 | rank 8. business: 0.006656122859567404
103 | rank 9. problem: 0.006193865556269884
104 | rank 10. sales: 0.00576401362195611
105 | ==================================================
106 | topic 9
107 | --------------------------------------------------
108 | rank 1. zzz_enron: 0.0346914604306221
109 | rank 2. anthrax: 0.018805652856826782
110 | rank 3. firm: 0.017304280772805214
111 | rank 4. employees: 0.013712462969124317
112 | rank 5. accounting: 0.011462894268333912
113 | rank 6. company: 0.010516936890780926
114 | rank 7. letter: 0.009165323339402676
115 | rank 8. zzz_arthur_andersen: 0.008399050682783127
116 | rank 9. financial: 0.006972334813326597
117 | rank 10. official: 0.006733026821166277
118 | ==================================================
119 | topic 10
120 | --------------------------------------------------
121 | rank 1. game: 0.02054956741631031
122 | rank 2. yard: 0.01949656568467617
123 | rank 3. season: 0.018450269475579262
124 | rank 4. play: 0.01595749706029892
125 | rank 5. team: 0.014850640669465065
126 | rank 6. coach: 0.012072306126356125
127 | rank 7. football: 0.010657819919288158
128 | rank 8. player: 0.010432523675262928
129 | rank 9. zzz_nfl: 0.009206585586071014
130 | rank 10. defensive: 0.008976943790912628
131 | ==================================================
132 | topic 11
133 | --------------------------------------------------
134 | rank 1. con: 0.020736297592520714
135 | rank 2. una: 0.013567320071160793
136 | rank 3. las: 0.01041751354932785
137 | rank 4. mas: 0.010156860575079918
138 | rank 5. dice: 0.009438637644052505
139 | rank 6. por: 0.00928747933357954
140 | rank 7. como: 0.008855272084474564
141 | rank 8. los: 0.008734958246350288
142 | rank 9. zzz_argentina: 0.0077548702247440815
143 | rank 10. anos: 0.0052759042009711266
144 | ==================================================
145 | topic 12
146 | --------------------------------------------------
147 | rank 1. zzz_afghanistan: 0.02263963408768177
148 | rank 2. zzz_taliban: 0.019689183682203293
149 | rank 3. military: 0.014852428808808327
150 | rank 4. bin: 0.014605461619794369
151 | rank 5. laden: 0.014458988793194294
152 | rank 6. war: 0.01199477817863226
153 | rank 7. zzz_pakistan: 0.01184108667075634
154 | rank 8. terrorist: 0.011557201854884624
155 | rank 9. zzz_u_s: 0.01051971036940813
156 | rank 10. attack: 0.009562982246279716
157 | ==================================================
158 | topic 13
159 | --------------------------------------------------
160 | rank 1. court: 0.02521500550210476
161 | rank 2. case: 0.023994332179427147
162 | rank 3. lawyer: 0.019229630008339882
163 | rank 4. trial: 0.012606462463736534
164 | rank 5. attorney: 0.011963741853833199
165 | rank 6. law: 0.010776755400002003
166 | rank 7. prosecutor: 0.010139403864741325
167 | rank 8. judge: 0.010069739073514938
168 | rank 9. federal: 0.01000827457755804
169 | rank 10. charges: 0.009131026454269886
170 | ==================================================
171 | topic 14
172 | --------------------------------------------------
173 | rank 1. children: 0.024856505915522575
174 | rank 2. family: 0.023518990725278854
175 | rank 3. mother: 0.021585773676633835
176 | rank 4. parent: 0.018566781654953957
177 | rank 5. father: 0.017965450882911682
178 | rank 6. child: 0.016640648245811462
179 | rank 7. son: 0.014798246324062347
180 | rank 8. boy: 0.013485484756529331
181 | rank 9. girl: 0.01209142617881298
182 | rank 10. daughter: 0.011482957750558853
183 | ==================================================
184 | topic 15
185 | --------------------------------------------------
186 | rank 1. home: 0.00700838677585125
187 | rank 2. run: 0.006053796038031578
188 | rank 3. right: 0.005981859751045704
189 | rank 4. left: 0.005203519947826862
190 | rank 5. part: 0.005086812656372786
191 | rank 6. night: 0.004532037302851677
192 | rank 7. put: 0.004220300819724798
193 | rank 8. took: 0.003913923632353544
194 | rank 9. called: 0.003663261653855443
195 | rank 10. early: 0.0034683081321418285
196 | ==================================================
197 | topic 16
198 | --------------------------------------------------
199 | rank 1. computer: 0.05519622564315796
200 | rank 2. system: 0.038603898137807846
201 | rank 3. zzz_microsoft: 0.0243679191917181
202 | rank 4. software: 0.02125958725810051
203 | rank 5. technology: 0.016031846404075623
204 | rank 6. window: 0.015655480325222015
205 | rank 7. mail: 0.01430702954530716
206 | rank 8. user: 0.011626251973211765
207 | rank 9. information: 0.010091814212501049
208 | rank 10. operating: 0.00756523571908474
209 | ==================================================
210 | topic 17
211 | --------------------------------------------------
212 | rank 1. law: 0.02723626047372818
213 | rank 2. right: 0.013789551332592964
214 | rank 3. political: 0.012496591545641422
215 | rank 4. government: 0.012413491494953632
216 | rank 5. religious: 0.01058514229953289
217 | rank 6. immigrant: 0.010227411054074764
218 | rank 7. power: 0.008888700045645237
219 | rank 8. ruling: 0.006956734228879213
220 | rank 9. court: 0.006303566973656416
221 | rank 10. opposition: 0.006150286644697189
222 | ==================================================
223 | topic 18
224 | --------------------------------------------------
225 | rank 1. driver: 0.026438318192958832
226 | rank 2. car: 0.021013904362916946
227 | rank 3. race: 0.02072136662900448
228 | rank 4. racing: 0.013081525452435017
229 | rank 5. airline: 0.012061871588230133
230 | rank 6. flight: 0.009761194698512554
231 | rank 7. track: 0.008779071271419525
232 | rank 8. races: 0.007440405432134867
233 | rank 9. airlines: 0.00735550606623292
234 | rank 10. carrier: 0.0064879427663981915
235 | ==================================================
236 | topic 19
237 | --------------------------------------------------
238 | rank 1. zzz_bush: 0.018247155472636223
239 | rank 2. official: 0.015417532064020634
240 | rank 3. zzz_united_states: 0.01530768908560276
241 | rank 4. administration: 0.013708231039345264
242 | rank 5. leader: 0.010346359573304653
243 | rank 6. countries: 0.009353779256343842
244 | rank 7. zzz_u_s: 0.009245852008461952
245 | rank 8. government: 0.009168907068669796
246 | rank 9. zzz_iraq: 0.009057393297553062
247 | rank 10. military: 0.008723369799554348
248 | ==================================================
249 | topic 20
250 | --------------------------------------------------
251 | rank 1. percent: 0.044513553380966187
252 | rank 2. stock: 0.023978371173143387
253 | rank 3. market: 0.022495266050100327
254 | rank 4. fund: 0.013825907371938229
255 | rank 5. billion: 0.012179437093436718
256 | rank 6. quarter: 0.010966183617711067
257 | rank 7. investor: 0.01015525683760643
258 | rank 8. investment: 0.009771433658897877
259 | rank 9. million: 0.009703823365271091
260 | rank 10. analyst: 0.00947034452110529
261 | ==================================================
262 | topic 21
263 | --------------------------------------------------
264 | rank 1. book: 0.02586548589169979
265 | rank 2. art: 0.009416724555194378
266 | rank 3. artist: 0.007856832817196846
267 | rank 4. collection: 0.007611869368702173
268 | rank 5. painting: 0.0066984654404222965
269 | rank 6. fashion: 0.005222611129283905
270 | rank 7. century: 0.005118153523653746
271 | rank 8. writer: 0.004741827957332134
272 | rank 9. designer: 0.004720605909824371
273 | rank 10. author: 0.004426541272550821
274 | ==================================================
275 | topic 22
276 | --------------------------------------------------
277 | rank 1. music: 0.03731286898255348
278 | rank 2. song: 0.023323602974414825
279 | rank 3. cell: 0.015249419026076794
280 | rank 4. album: 0.011770840734243393
281 | rank 5. band: 0.011705778539180756
282 | rank 6. musical: 0.008127620443701744
283 | rank 7. singer: 0.006815788336098194
284 | rank 8. concert: 0.006784858647733927
285 | rank 9. jazz: 0.006698825396597385
286 | rank 10. sound: 0.006471691187471151
287 | ==================================================
288 | topic 23
289 | --------------------------------------------------
290 | rank 1. web: 0.04922621324658394
291 | rank 2. site: 0.03805321082472801
292 | rank 3. www: 0.03708707541227341
293 | rank 4. com: 0.03255585581064224
294 | rank 5. online: 0.027454305440187454
295 | rank 6. zzz_internet: 0.019746430218219757
296 | rank 7. sites: 0.018789643421769142
297 | rank 8. information: 0.012109276838600636
298 | rank 9. mail: 0.010703440755605698
299 | rank 10. internet: 0.010465497151017189
300 | ==================================================
301 | topic 24
302 | --------------------------------------------------
303 | rank 1. cup: 0.013092475943267345
304 | rank 2. food: 0.011349334381520748
305 | rank 3. minutes: 0.008257454261183739
306 | rank 4. add: 0.007631846237927675
307 | rank 5. tablespoon: 0.006674299016594887
308 | rank 6. oil: 0.006410549394786358
309 | rank 7. pepper: 0.005671842489391565
310 | rank 8. sugar: 0.005601006560027599
311 | rank 9. teaspoon: 0.005426750052720308
312 | rank 10. water: 0.005266525782644749
313 | ==================================================
314 | topic 25
315 | --------------------------------------------------
316 | rank 1. team: 0.03806942701339722
317 | rank 2. season: 0.0169094055891037
318 | rank 3. games: 0.014860374853014946
319 | rank 4. zzz_olympic: 0.013387414626777172
320 | rank 5. coach: 0.011522733606398106
321 | rank 6. zzz_miami: 0.00958396214991808
322 | rank 7. athletes: 0.009186827577650547
323 | rank 8. player: 0.00904573779553175
324 | rank 9. football: 0.008782983757555485
325 | rank 10. defense: 0.007022276986390352
326 | ==================================================
327 | topic 26
328 | --------------------------------------------------
329 | rank 1. zzz_united_states: 0.00901876762509346
330 | rank 2. zzz_american: 0.008501513861119747
331 | rank 3. american: 0.008129569701850414
332 | rank 4. country: 0.006140291225165129
333 | rank 5. government: 0.005337539594620466
334 | rank 6. group: 0.005324787925928831
335 | rank 7. german: 0.0052407230250537395
336 | rank 8. history: 0.004972025752067566
337 | rank 9. french: 0.0047142705880105495
338 | rank 10. family: 0.004625052213668823
339 | ==================================================
340 | topic 27
341 | --------------------------------------------------
342 | rank 1. women: 0.07863874733448029
343 | rank 2. gay: 0.020628171041607857
344 | rank 3. dog: 0.014878431335091591
345 | rank 4. magazine: 0.01347420085221529
346 | rank 5. woman: 0.012085708789527416
347 | rank 6. sex: 0.009264894761145115
348 | rank 7. female: 0.008259394206106663
349 | rank 8. cat: 0.006200404372066259
350 | rank 9. male: 0.0057057044468820095
351 | rank 10. lesbian: 0.0040387725457549095
352 | ==================================================
353 | topic 28
354 | --------------------------------------------------
355 | rank 1. digital: 0.011757075786590576
356 | rank 2. screen: 0.0080463457852602
357 | rank 3. wine: 0.007102092728018761
358 | rank 4. device: 0.006819858215749264
359 | rank 5. wines: 0.0068092974834144115
360 | rank 6. chip: 0.006679498124867678
361 | rank 7. computer: 0.006480266340076923
362 | rank 8. devices: 0.005909178406000137
363 | rank 9. electronic: 0.0056115672923624516
364 | rank 10. images: 0.004710317123681307
365 | ==================================================
366 | topic 29
367 | --------------------------------------------------
368 | rank 1. campaign: 0.03327873349189758
369 | rank 2. political: 0.014918365515768528
370 | rank 3. democratic: 0.014790846966207027
371 | rank 4. election: 0.014583878219127655
372 | rank 5. republican: 0.014538025483489037
373 | rank 6. voter: 0.01402147114276886
374 | rank 7. zzz_al_gore: 0.013029148802161217
375 | rank 8. zzz_party: 0.012214157730340958
376 | rank 9. zzz_republican: 0.011119640432298183
377 | rank 10. candidates: 0.010824044235050678
378 | ==================================================
379 | topic 30
380 | --------------------------------------------------
381 | rank 1. school: 0.03626062348484993
382 | rank 2. student: 0.021992284804582596
383 | rank 3. black: 0.015230956487357616
384 | rank 4. group: 0.013538197614252567
385 | rank 5. public: 0.010991621762514114
386 | rank 6. percent: 0.010974901728332043
387 | rank 7. zzz_texas: 0.008697726763784885
388 | rank 8. gun: 0.007661579176783562
389 | rank 9. member: 0.0075561245903372765
390 | rank 10. white: 0.007528342306613922
391 | ==================================================
392 | topic 31
393 | --------------------------------------------------
394 | rank 1. zzz_fbi: 0.025642145425081253
395 | rank 2. fish: 0.020048771053552628
396 | rank 3. bird: 0.013764469884335995
397 | rank 4. agent: 0.011454230174422264
398 | rank 5. irish: 0.009724821895360947
399 | rank 6. fishing: 0.00831819698214531
400 | rank 7. zzz_timothy_mcveigh: 0.006179510150104761
401 | rank 8. zzz_brazil: 0.006174848414957523
402 | rank 9. hijacker: 0.0060051921755075455
403 | rank 10. zzz_simon: 0.005628513637930155
404 | ==================================================
405 | topic 32
406 | --------------------------------------------------
407 | rank 1. company: 0.07715368270874023
408 | rank 2. companies: 0.033467356115579605
409 | rank 3. business: 0.019932780414819717
410 | rank 4. million: 0.01110815443098545
411 | rank 5. deal: 0.01099175214767456
412 | rank 6. executives: 0.010963932611048222
413 | rank 7. executive: 0.010428434237837791
414 | rank 8. market: 0.0098022585734725
415 | rank 9. stock: 0.009284550324082375
416 | rank 10. chief: 0.008711854927241802
417 | ==================================================
418 | topic 33
419 | --------------------------------------------------
420 | rank 1. consumer: 0.02195882610976696
421 | rank 2. percent: 0.020870916545391083
422 | rank 3. companies: 0.015635766088962555
423 | rank 4. industry: 0.015347079373896122
424 | rank 5. market: 0.014645704068243504
425 | rank 6. cost: 0.012568947859108448
426 | rank 7. customer: 0.012199653312563896
427 | rank 8. prices: 0.010143699124455452
428 | rank 9. high: 0.009660380892455578
429 | rank 10. worker: 0.006465692073106766
430 | ==================================================
431 | topic 34
432 | --------------------------------------------------
433 | rank 1. season: 0.021334033459424973
434 | rank 2. team: 0.016839321702718735
435 | rank 3. game: 0.014815553091466427
436 | rank 4. inning: 0.014347057789564133
437 | rank 5. player: 0.013774506747722626
438 | rank 6. yankees: 0.011174232698976994
439 | rank 7. run: 0.010817022994160652
440 | rank 8. baseball: 0.01055373065173626
441 | rank 9. games: 0.010321191512048244
442 | rank 10. hit: 0.010284436866641045
443 | ==================================================
444 | topic 35
445 | --------------------------------------------------
446 | rank 1. zzz_george_bush: 0.05796745792031288
447 | rank 2. zzz_al_gore: 0.04237228259444237
448 | rank 3. election: 0.022491727024316788
449 | rank 4. president: 0.020312432199716568
450 | rank 5. ballot: 0.019908472895622253
451 | rank 6. zzz_florida: 0.016183944419026375
452 | rank 7. presidential: 0.015332216396927834
453 | rank 8. votes: 0.01442129909992218
454 | rank 9. vote: 0.009808804839849472
455 | rank 10. zzz_bush: 0.00961968582123518
456 | ==================================================
457 | topic 36
458 | --------------------------------------------------
459 | rank 1. palestinian: 0.02687947452068329
460 | rank 2. zzz_israel: 0.023833250626921654
461 | rank 3. zzz_israeli: 0.013304143212735653
462 | rank 4. soldier: 0.010826818645000458
463 | rank 5. peace: 0.010164049454033375
464 | rank 6. zzz_yasser_arafat: 0.009658769704401493
465 | rank 7. israeli: 0.009265914559364319
466 | rank 8. war: 0.00923923496156931
467 | rank 9. israelis: 0.008119330741465092
468 | rank 10. military: 0.007811776362359524
469 | ==================================================
470 | topic 37
471 | --------------------------------------------------
472 | rank 1. death: 0.023664837703108788
473 | rank 2. prison: 0.016880618408322334
474 | rank 3. murder: 0.01633421890437603
475 | rank 4. book: 0.009351547807455063
476 | rank 5. killed: 0.009010221809148788
477 | rank 6. prisoner: 0.007692103274166584
478 | rank 7. killing: 0.007337935268878937
479 | rank 8. woman: 0.007256744429469109
480 | rank 9. victim: 0.007001840975135565
481 | rank 10. shooting: 0.006456068251281977
482 | ==================================================
483 | topic 38
484 | --------------------------------------------------
485 | rank 1. million: 0.01617966778576374
486 | rank 2. newspaper: 0.009461159817874432
487 | rank 3. show: 0.006403861101716757
488 | rank 4. program: 0.005598483607172966
489 | rank 5. network: 0.0053542195819318295
490 | rank 6. money: 0.00485030934214592
491 | rank 7. according: 0.004323051776736975
492 | rank 8. special: 0.0040418170392513275
493 | rank 9. help: 0.004037346225231886
494 | rank 10. past: 0.0039222449995577335
495 | ==================================================
496 | topic 39
497 | --------------------------------------------------
498 | rank 1. show: 0.022530050948262215
499 | rank 2. character: 0.009580017998814583
500 | rank 3. audience: 0.005444356705993414
501 | rank 4. television: 0.004325090907514095
502 | rank 5. series: 0.004303744062781334
503 | rank 6. look: 0.004119543824344873
504 | rank 7. love: 0.00407353974878788
505 | rank 8. film: 0.004058054182678461
506 | rank 9. find: 0.003848094493150711
507 | rank 10. young: 0.0036786773707717657
508 | ==================================================
509 | topic 40
510 | --------------------------------------------------
511 | rank 1. drug: 0.047516606748104095
512 | rank 2. government: 0.012602291069924831
513 | rank 3. zzz_aid: 0.01227615773677826
514 | rank 4. zzz_india: 0.010664834640920162
515 | rank 5. countries: 0.008103608153760433
516 | rank 6. million: 0.007103894371539354
517 | rank 7. food: 0.006576470099389553
518 | rank 8. farmer: 0.006402278784662485
519 | rank 9. country: 0.006317282561212778
520 | rank 10. zzz_united_states: 0.0062563237734138966
521 | ==================================================
522 | topic 41
523 | --------------------------------------------------
524 | rank 1. game: 0.026529431343078613
525 | rank 2. player: 0.022719431668519974
526 | rank 3. games: 0.0206462275236845
527 | rank 4. sport: 0.016915155574679375
528 | rank 5. fan: 0.012125855311751366
529 | rank 6. soccer: 0.011505456641316414
530 | rank 7. video: 0.010653939098119736
531 | rank 8. zzz_nbc: 0.009938360191881657
532 | rank 9. zzz_nba: 0.009428229182958603
533 | rank 10. team: 0.008263841271400452
534 | ==================================================
535 | topic 42
536 | --------------------------------------------------
537 | rank 1. tax: 0.04971655085682869
538 | rank 2. cut: 0.026394149288535118
539 | rank 3. economy: 0.0230980534106493
540 | rank 4. economic: 0.017415864393115044
541 | rank 5. zzz_mexico: 0.01618388667702675
542 | rank 6. government: 0.01595328189432621
543 | rank 7. taxes: 0.014780825935304165
544 | rank 8. spending: 0.01243556011468172
545 | rank 9. income: 0.012374772690236568
546 | rank 10. zzz_social_security: 0.010477164760231972
547 | ==================================================
548 | topic 43
549 | --------------------------------------------------
550 | rank 1. zzz_bush: 0.027270827442407608
551 | rank 2. bill: 0.024806691333651543
552 | rank 3. zzz_congress: 0.018335092812776566
553 | rank 4. zzz_white_house: 0.016858264803886414
554 | rank 5. federal: 0.01354345865547657
555 | rank 6. zzz_senate: 0.01329002995043993
556 | rank 7. plan: 0.012937983497977257
557 | rank 8. proposal: 0.010213974863290787
558 | rank 9. administration: 0.009349077008664608
559 | rank 10. health: 0.008263114839792252
560 | ==================================================
561 | topic 44
562 | --------------------------------------------------
563 | rank 1. point: 0.020692508667707443
564 | rank 2. team: 0.018113387748599052
565 | rank 3. game: 0.015103872865438461
566 | rank 4. season: 0.013727625831961632
567 | rank 5. play: 0.012306117452681065
568 | rank 6. goal: 0.012093267403542995
569 | rank 7. games: 0.011415580287575722
570 | rank 8. shot: 0.011306485161185265
571 | rank 9. king: 0.011238034814596176
572 | rank 10. player: 0.008728481829166412
573 | ==================================================
574 | topic 45
575 | --------------------------------------------------
576 | rank 1. player: 0.013769405893981457
577 | rank 2. point: 0.012727474793791771
578 | rank 3. win: 0.012649298645555973
579 | rank 4. play: 0.011700315400958061
580 | rank 5. round: 0.010591110214591026
581 | rank 6. season: 0.010317614302039146
582 | rank 7. shot: 0.01031588576734066
583 | rank 8. game: 0.00999273732304573
584 | rank 9. team: 0.009904314763844013
585 | rank 10. final: 0.009542282670736313
586 | ==================================================
587 | topic 46
588 | --------------------------------------------------
589 | rank 1. zzz_china: 0.015810564160346985
590 | rank 2. oil: 0.014123033732175827
591 | rank 3. power: 0.013019545003771782
592 | rank 4. zzz_russia: 0.012522333301603794
593 | rank 5. energy: 0.01063102949410677
594 | rank 6. plant: 0.010357524268329144
595 | rank 7. gas: 0.00931472983211279
596 | rank 8. nuclear: 0.008214462548494339
597 | rank 9. missile: 0.007829232141375542
598 | rank 10. environmental: 0.007554346229881048
599 | ==================================================
600 | topic 47
601 | --------------------------------------------------
602 | rank 1. com: 0.02512955479323864
603 | rank 2. zzz_laker: 0.015019885264337063
604 | rank 3. palm: 0.013598510064184666
605 | rank 4. daily: 0.013184287585318089
606 | rank 5. statesman: 0.013182769529521465
607 | rank 6. beach: 0.01314060389995575
608 | rank 7. question: 0.010342201218008995
609 | rank 8. zzz_eastern: 0.009052561596035957
610 | rank 9. information: 0.008214504458010197
611 | rank 10. austin: 0.007981293834745884
612 | ==================================================
613 | topic 48
614 | --------------------------------------------------
615 | rank 1. film: 0.034848302602767944
616 | rank 2. movie: 0.02526075392961502
617 | rank 3. actor: 0.013231894932687283
618 | rank 4. movies: 0.008959283120930195
619 | rank 5. zzz_hollywood: 0.008070441894233227
620 | rank 6. play: 0.007740044500678778
621 | rank 7. theater: 0.00727312033995986
622 | rank 8. director: 0.005834080744534731
623 | rank 9. character: 0.005199376493692398
624 | rank 10. zzz_oscar: 0.004690317437052727
625 | ==================================================
626 | topic 49
627 | --------------------------------------------------
628 | rank 1. patient: 0.02304932475090027
629 | rank 2. doctor: 0.01952706277370453
630 | rank 3. cancer: 0.011629555374383926
631 | rank 4. medical: 0.011445121839642525
632 | rank 5. disease: 0.011433145962655544
633 | rank 6. hospital: 0.009982189163565636
634 | rank 7. study: 0.008990893140435219
635 | rank 8. treatment: 0.007559608668088913
636 | rank 9. blood: 0.007204002235084772
637 | rank 10. test: 0.007000159937888384
638 | ==================================================
639 | topic 50
640 | --------------------------------------------------
641 | rank 1. million: 0.028744814917445183
642 | rank 2. contract: 0.016937075182795525
643 | rank 3. agent: 0.009414087980985641
644 | rank 4. manager: 0.007703984156250954
645 | rank 5. business: 0.006961227394640446
646 | rank 6. high: 0.005569536704570055
647 | rank 7. club: 0.005377542693167925
648 | rank 8. past: 0.005371585488319397
649 | rank 9. career: 0.005363883450627327
650 | rank 10. hand: 0.005337761249393225
651 | 


--------------------------------------------------------------------------------
/examples/example_lda.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,logging-format-truncated
  8 | # pylint: disable=too-few-public-methods
  9 | import os
 10 | from os.path import join as pjoin
 11 | import time
 12 | import pickle
 13 | import subprocess
 14 | 
 15 | import tqdm
 16 | import fire
 17 | import wget
 18 | import h5py
 19 | import numpy as np
 20 | import pandas as pd
 21 | 
 22 | # import gensim
 23 | from gensim.models.ldamulticore import LdaMulticore
 24 | 
 25 | from cusim import aux, CuLDA
 26 | 
 27 | LOGGER = aux.get_logger()
 28 | # DATASET = "nips"
 29 | DATASET = "nytimes"
 30 | DIR_PATH = "./res"
 31 | BASE_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/" \
 32 |            "bag-of-words/"
 33 | 
 34 | def download():
 35 |   if not os.path.exists(DIR_PATH):
 36 |     os.makedirs(DIR_PATH, exist_ok=True)
 37 | 
 38 |   if os.path.exists(pjoin(DIR_PATH, f"docword.{DATASET}.txt")):
 39 |     LOGGER.info("path %s already exists",
 40 |                 pjoin(DIR_PATH, f"docword.{DATASET}.txt"))
 41 |     return
 42 | 
 43 |   # download docword
 44 |   filename = f"docword.{DATASET}.txt.gz"
 45 |   out_path = pjoin(DIR_PATH, filename)
 46 |   LOGGER.info("download %s to %s", BASE_URL + filename, out_path)
 47 |   wget.download(BASE_URL + filename, out=out_path)
 48 |   print()
 49 | 
 50 |   # decompress
 51 |   cmd = ["gunzip", "-c", out_path, ">",
 52 |          pjoin(DIR_PATH, f"docword.{DATASET}.txt")]
 53 |   cmd = " ".join(cmd)
 54 |   subprocess.call(cmd, shell=True)
 55 |   os.remove(pjoin(DIR_PATH, filename))
 56 | 
 57 |   # download vocab
 58 |   filename = f"vocab.{DATASET}.txt"
 59 |   out_path = pjoin(DIR_PATH, filename)
 60 |   LOGGER.info("download %s to %s", BASE_URL + filename, out_path)
 61 |   wget.download(BASE_URL + filename, out=out_path)
 62 |   print()
 63 | 
 64 | def run_cusim():
 65 |   download()
 66 |   data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
 67 |   keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
 68 |   processed_data_path = pjoin(DIR_PATH, f"docword.{DATASET}.h5")
 69 |   opt = {
 70 |     "data_path": data_path,
 71 |     "processed_data_path": processed_data_path,
 72 |     "keys_path": keys_path,
 73 |     "num_topics": 50,
 74 |     "num_iters_in_e_step": 10,
 75 |     "reuse_gamma": True,
 76 |     # "skip_preprocess": os.path.exists(processed_data_path),
 77 |   }
 78 |   start = time.time()
 79 |   lda = CuLDA(opt)
 80 |   lda.train_model()
 81 |   el0 = time.time() - start
 82 |   LOGGER.info("elapsed for training LDA using cusim: %.4e sec", el0)
 83 |   h5_model_path = pjoin(DIR_PATH, "cusim.lda.model.h5")
 84 |   lda.save_h5_model(h5_model_path)
 85 |   show_cusim_topics(h5_model_path)
 86 |   return el0
 87 | 
 88 | def show_cusim_topics(h5_model_path, topk=10):
 89 |   h5f = h5py.File(h5_model_path, "r")
 90 |   beta = h5f["beta"][:, :].T
 91 |   keys = h5f["keys"][:]
 92 |   show_topics(beta, keys, topk, "cusim.topics.txt")
 93 | 
 94 | def build_gensim_corpus():
 95 |   corpus_path = pjoin(DIR_PATH, f"docword.{DATASET}.pk")
 96 |   if os.path.exists(corpus_path):
 97 |     LOGGER.info("load corpus from %s", corpus_path)
 98 |     with open(corpus_path, "rb") as fin:
 99 |       ret = pickle.loads(fin.read())
100 |     return ret
101 | 
102 |   # get corpus for gensim lda
103 |   data_path = pjoin(DIR_PATH, f"docword.{DATASET}.txt")
104 |   LOGGER.info("build corpus from %s", data_path)
105 |   docs, doc, curid = [], [], -1
106 |   with open(data_path, "r") as fin:
107 |     for idx, line in tqdm.tqdm(enumerate(fin)):
108 |       if idx < 3:
109 |         continue
110 |       docid, wordid, count = line.strip().split()
111 |       # zero-base id
112 |       docid, wordid, count = int(docid) - 1, int(wordid) - 1, float(count)
113 |       if 0 <= curid < docid:
114 |         docs.append(doc)
115 |         doc = []
116 |       doc.append((wordid, count))
117 |       curid = docid
118 |     docs.append(doc)
119 |   LOGGER.info("save corpus to %s", corpus_path)
120 |   with open(corpus_path, "wb") as fout:
121 |     fout.write(pickle.dumps(docs, 2))
122 |   return docs
123 | 
124 | def run_gensim():
125 |   docs = build_gensim_corpus()
126 |   keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
127 |   LOGGER.info("load vocab from %s", keys_path)
128 |   id2word = {}
129 |   with open(keys_path, "rb") as fin:
130 |     for idx, line in enumerate(fin):
131 |       id2word[idx] = line.strip()
132 | 
133 |   start = time.time()
134 |   lda = LdaMulticore(docs, num_topics=50, workers=None,
135 |                     id2word=id2word, iterations=10)
136 |   el0 = time.time() - start
137 |   LOGGER.info("elapsed for training lda using gensim: %.4e sec", el0)
138 |   model_path = pjoin(DIR_PATH, "gensim.lda.model")
139 |   LOGGER.info("save gensim lda model to %s", model_path)
140 |   lda.save(model_path)
141 |   show_gensim_topics(model_path)
142 |   return el0
143 | 
144 | def show_gensim_topics(model_path=None, topk=10):
145 |   # load beta
146 |   model_path = model_path or pjoin(DIR_PATH, "gensim.lda.model")
147 |   LOGGER.info("load gensim lda model from %s", model_path)
148 |   lda = LdaMulticore.load(model_path)
149 |   beta = lda.state.get_lambda()
150 |   beta /= np.sum(beta, axis=1)[:, None]
151 | 
152 |   # load keys
153 |   keys_path = pjoin(DIR_PATH, f"vocab.{DATASET}.txt")
154 |   LOGGER.info("load vocab from %s", keys_path)
155 |   with open(keys_path, "rb") as fin:
156 |     keys = [line.strip() for line in fin]
157 |   show_topics(beta, keys, topk, "gensim.topics.txt")
158 | 
159 | def show_topics(beta, keys, topk, result_path):
160 |   LOGGER.info("save results to %s (topk: %d)", result_path, topk)
161 |   fout = open(result_path, "w")
162 |   for idx in range(beta.shape[0]):
163 |     print("=" * 50)
164 |     fout.write("=" * 50 + "\n")
165 |     print(f"topic {idx + 1}")
166 |     fout.write(f"topic {idx + 1}" + "\n")
167 |     print("-" * 50)
168 |     fout.write("-" * 50 + "\n")
169 |     _beta = beta[idx, :]
170 |     indices = np.argsort(-_beta)[:topk]
171 |     for rank, wordid in enumerate(indices):
172 |       word = keys[wordid].decode("utf8")
173 |       prob = _beta[wordid]
174 |       print(f"rank {rank + 1}. {word}: {prob}")
175 |       fout.write(f"rank {rank + 1}. {word}: {prob}" + "\n")
176 |   fout.close()
177 | 
178 | 
179 | def run_experiments():
180 |   training_time = {"attr": "training time (sec)"}
181 |   training_time["gensim (8 vpus)"] = run_gensim()
182 |   training_time["cusim"] = run_cusim()
183 |   df0 = pd.DataFrame([training_time])
184 |   df0.set_index("attr", inplace=True)
185 |   print(df0.to_markdown())
186 | 
187 | 
188 | if __name__ == "__main__":
189 |   fire.Fire()
190 | 


--------------------------------------------------------------------------------
/examples/example_w2v.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=no-name-in-module,logging-format-truncated
  8 | # pylint: disable=too-few-public-methods
  9 | import os
 10 | import time
 11 | import subprocess
 12 | 
 13 | import tqdm
 14 | import fire
 15 | import pandas as pd
 16 | 
 17 | import gensim
 18 | from gensim import downloader as api
 19 | from gensim.test.utils import datapath
 20 | 
 21 | import nltk
 22 | from nltk.tokenize import RegexpTokenizer
 23 | 
 24 | from cusim import aux, CuW2V
 25 | 
 26 | 
 27 | LOGGER = aux.get_logger()
 28 | DOWNLOAD_PATH = "./res"
 29 | DATASET = "quora-duplicate-questions"
 30 | DATA_PATH = f"./res/{DATASET}.stream.txt"
 31 | PROCESSED_DATA_DIR = f"./res/{DATASET}-processed"
 32 | CUSIM_MODEL = "./res/cusim.w2v.model"
 33 | GENSIM_MODEL = "./res/gensim.w2v.model"
 34 | 
 35 | 
 36 | # common hyperparameters
 37 | MIN_COUNT = 5
 38 | LEARNING_RATE = 0.001
 39 | NEG_SIZE = 10
 40 | NUM_DIMS = 100
 41 | CBOW_MEAN = False
 42 | EPOCHS = 10
 43 | 
 44 | 
 45 | def download():
 46 |   if os.path.exists(DATA_PATH):
 47 |     LOGGER.info("%s already exists", DATA_PATH)
 48 |     return
 49 |   if not os.path.exists(DOWNLOAD_PATH):
 50 |     os.makedirs(DOWNLOAD_PATH, exist_ok=True)
 51 |   api.BASE_DIR = DOWNLOAD_PATH
 52 |   filepath = api.load(DATASET, return_path=True)
 53 |   LOGGER.info("filepath: %s", filepath)
 54 |   cmd = ["gunzip", "-c", filepath, ">", DATA_PATH]
 55 |   cmd = " ".join(cmd)
 56 |   LOGGER.info("cmd: %s", cmd)
 57 |   subprocess.call(cmd, shell=True)
 58 |   preprocess_data()
 59 | 
 60 | def preprocess_data():
 61 |   tokenizer = RegexpTokenizer(r'\w+')
 62 |   nltk.download("wordnet")
 63 |   lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
 64 |   fout = open(DATA_PATH + ".tmp", "wb")
 65 |   with open(DATA_PATH, "rb") as fin:
 66 |     for line in tqdm.tqdm(fin):
 67 |       line = line.decode("utf8").strip()
 68 |       line = preprocess_line(line, tokenizer, lemmatizer)
 69 |       fout.write((line + "\n").encode("utf8"))
 70 |   fout.close()
 71 |   os.rename(DATA_PATH + ".tmp", DATA_PATH)
 72 | 
 73 | def preprocess_line(line, tokenizer, lemmatizer):
 74 |   line = line.lower()
 75 |   line = tokenizer.tokenize(line)
 76 |   line = [token for token in line
 77 |           if not token.isnumeric() and len(token) > 1]
 78 |   line = [lemmatizer.lemmatize(token) for token in line]
 79 |   return " ".join(line)
 80 | 
 81 | def run_cusim(skip_gram=False, hierarchical_softmax=False):
 82 |   download()
 83 |   opt = {
 84 |     "data_path": DATA_PATH,
 85 |     "processed_data_dir": PROCESSED_DATA_DIR,
 86 |     # "skip_preprocess": os.path.exists(PROCESSED_DATA_DIR),
 87 |     "num_dims": NUM_DIMS,
 88 |     "epochs": EPOCHS,
 89 |     "word_min_count": MIN_COUNT,
 90 |     "lr": 0.001,
 91 |     "io": {
 92 |       "lower": False
 93 |     },
 94 |     "neg": 0 if hierarchical_softmax else NEG_SIZE,
 95 |     "skip_gram": skip_gram,
 96 |     "cbow_mean": CBOW_MEAN,
 97 |   }
 98 |   start = time.time()
 99 |   w2v = CuW2V(opt)
100 |   w2v.train_model()
101 |   elapsed = time.time() - start
102 |   LOGGER.info("elapsed for cusim w2v training: %.4e sec", elapsed)
103 |   w2v.save_word2vec_format(CUSIM_MODEL, binary=False)
104 |   return elapsed, evaluate_w2v_model(CUSIM_MODEL)
105 | 
106 | def run_gensim(skip_gram=False, hierarchical_softmax=False, workers=8):
107 |   download()
108 |   start = time.time()
109 |   model = gensim.models.Word2Vec(corpus_file=DATA_PATH, workers=workers,
110 |                                  sg=skip_gram, hs=hierarchical_softmax,
111 |                                  min_alpha=LEARNING_RATE, min_count=MIN_COUNT,
112 |                                  alpha=LEARNING_RATE, negative=NEG_SIZE,
113 |                                  iter=EPOCHS, cbow_mean=CBOW_MEAN,
114 |                                  size=NUM_DIMS)
115 |   elapsed = time.time() - start
116 |   LOGGER.info("elapsed for gensim w2v training: %.4e sec", elapsed)
117 |   model.wv.save_word2vec_format(GENSIM_MODEL, binary=False)
118 |   LOGGER.info("gensim w2v model is saved to %s", GENSIM_MODEL)
119 |   return elapsed, evaluate_w2v_model(GENSIM_MODEL)
120 | 
121 | def evaluate_w2v_model(model=GENSIM_MODEL):
122 |   LOGGER.info("load word2vec format model from %s", model)
123 |   model = gensim.models.KeyedVectors.load_word2vec_format(model)
124 |   results = model.wv.evaluate_word_pairs(datapath("wordsim353.tsv"),
125 |                                          case_insensitive=False)
126 |   LOGGER.info("evaluation results: %s", results)
127 |   return results
128 | 
129 | # gpu model variable is for being displayed in markdown
130 | # please put the real gpu modelname
131 | def run_experiments(skip_gram=False, hierarchical_softmax=False,
132 |                     gpu_model="NVIDIA T4"):
133 |   training_time = {"attr": "training time (sec)"}
134 |   pearson = {"attr": "pearson"}
135 |   spearman = {"attr": "spearman"}
136 |   for i in [1, 2, 4, 8]:
137 |     elapsed, evals = run_gensim(skip_gram, hierarchical_softmax, i)
138 |     training_time[f"{i} workers (gensim)"] = elapsed
139 |     pearson[f"{i} workers (gensim)"] = evals[0][0]
140 |     spearman[f"{i} workers (gensim)"] = evals[1][0]
141 |   elapsed, evals = run_cusim(skip_gram, hierarchical_softmax)
142 |   gpu_title = f"{gpu_model} (cusim)"
143 |   training_time[gpu_title] = elapsed
144 |   pearson[gpu_title] = evals[0][0]
145 |   spearman[gpu_title] = evals[1][0]
146 |   df0 = pd.DataFrame([training_time, pearson, spearman])
147 |   df0.set_index("attr", inplace=True)
148 |   print(df0.to_markdown())
149 | 
150 | # gpu model variable is for being displayed in markdown
151 | # please put the real gpu modelname
152 | def run_various_experiments(gpu_model="NVIDIA T4"):
153 |   for sg0 in [True, False]:
154 |     for hs0 in [True, False]:
155 |       print("=" * 100)
156 |       LOGGER.info("setting: %s, %s",
157 |                   "skip gram" if sg0 else "cbow",
158 |                   "hierarchical softmax" if hs0 else "negative sampling")
159 |       run_experiments(sg0, hs0, gpu_model)
160 | 
161 | 
162 | if __name__ == "__main__":
163 |   fire.Fire()
164 | 


--------------------------------------------------------------------------------
/examples/requirements.txt:
--------------------------------------------------------------------------------
1 | fire
2 | gensim==3.8.3
3 | nltk
4 | tqdm
5 | wget
6 | pandas
7 | tabulate
8 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "setuptools>=1.3.2",
4 |     "numpy",
5 |     "pybind11"
6 | ]
7 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | h5py
2 | jsmin
3 | numpy
4 | scipy
5 | pybind11
6 | protobuf==3.10.0
7 | grpcio-tools==1.27.1
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2021 Jisang Yoon
  2 | # All rights reserved.
  3 | #
  4 | # This source code is licensed under the Apache 2.0 license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | # pylint: disable=fixme,too-few-public-methods
  8 | # reference: https://github.com/kakao/buffalo/blob/
  9 | # 5f571c2c7d8227e6625c6e538da929e4db11b66d/setup.py
 10 | """cusim
 11 | """
 12 | import os
 13 | import sys
 14 | import glob
 15 | import pathlib
 16 | import platform
 17 | import sysconfig
 18 | import subprocess
 19 | from setuptools import setup, Extension
 20 | 
 21 | import pybind11
 22 | import numpy as np
 23 | from cuda_setup import CUDA, BUILDEXT
 24 | 
 25 | 
 26 | DOCLINES = __doc__.split("\n")
 27 | 
 28 | # TODO: Python3 Support
 29 | if sys.version_info[:3] < (3, 6):
 30 |   raise RuntimeError("Python version 3.6 or later required.")
 31 | 
 32 | assert platform.system() == 'Linux'  # TODO: MacOS
 33 | with open("requirements.txt", "r") as fin:
 34 |   INSTALL_REQUIRES = [line.strip() for line in fin]
 35 | 
 36 | MAJOR = 0
 37 | MINOR = 0
 38 | MICRO = 2
 39 | RELEASE = True
 40 | STAGE = {True: '', False: 'b'}.get(RELEASE)
 41 | VERSION = f'{MAJOR}.{MINOR}.{MICRO}{STAGE}'
 42 | STATUS = {False: 'Development Status :: 4 - Beta',
 43 |           True: 'Development Status :: 5 - Production/Stable'}
 44 | 
 45 | CLASSIFIERS = """{status}
 46 | Programming Language :: C++
 47 | Programming Language :: Python :: 3.6
 48 | Operating System :: POSIX :: Linux
 49 | Operating System :: Unix
 50 | Operating System :: MacOS
 51 | License :: OSI Approved :: Apache Software License""".format( \
 52 |   status=STATUS.get(RELEASE))
 53 | CLIB_DIR = os.path.join(sysconfig.get_path('purelib'), 'cusim')
 54 | LIBRARY_DIRS = [CLIB_DIR]
 55 | 
 56 | 
 57 | def get_extend_compile_flags():
 58 |   flags = ['-march=native']
 59 |   return flags
 60 | 
 61 | 
 62 | class CMakeExtension(Extension):
 63 |   extension_type = 'cmake'
 64 | 
 65 |   def __init__(self, name):
 66 |     super().__init__(name, sources=[])
 67 | 
 68 | 
 69 | extend_compile_flags = get_extend_compile_flags()
 70 | extra_compile_args = ['-fopenmp', '-std=c++14', '-ggdb', '-O3'] + \
 71 |   extend_compile_flags
 72 | util_srcs = glob.glob("cpp/src/utils/*.cc")
 73 | extensions = [
 74 |   Extension("cusim.ioutils.ioutils_bind",
 75 |             sources = util_srcs + [ \
 76 |               "cusim/ioutils/bindings.cc",
 77 |               "3rd/json11/json11.cpp"],
 78 |             language="c++",
 79 |             extra_compile_args=extra_compile_args,
 80 |             extra_link_args=["-fopenmp"],
 81 |             extra_objects=[],
 82 |             include_dirs=[ \
 83 |               "cpp/include/", np.get_include(), pybind11.get_include(),
 84 |               pybind11.get_include(True),
 85 |               "3rd/json11", "3rd/spdlog/include"]),
 86 |   Extension("cusim.culda.culda_bind",
 87 |             sources= util_srcs + [ \
 88 |               "cpp/src/culda/culda.cu",
 89 |               "cusim/culda/bindings.cc",
 90 |               "3rd/json11/json11.cpp"],
 91 |             language="c++",
 92 |             extra_compile_args=extra_compile_args,
 93 |             extra_link_args=["-fopenmp"],
 94 |             library_dirs=[CUDA['lib64']],
 95 |             libraries=['cudart', 'curand'],
 96 |             extra_objects=[],
 97 |             include_dirs=[ \
 98 |               "cpp/include/", np.get_include(), pybind11.get_include(),
 99 |               pybind11.get_include(True), CUDA['include'],
100 |               "3rd/json11", "3rd/spdlog/include"]),
101 |   Extension("cusim.cuw2v.cuw2v_bind",
102 |             sources= util_srcs + [ \
103 |               "cpp/src/cuw2v/cuw2v.cu",
104 |               "cusim/cuw2v/bindings.cc",
105 |               "3rd/json11/json11.cpp"],
106 |             language="c++",
107 |             extra_compile_args=extra_compile_args,
108 |             extra_link_args=["-fopenmp"],
109 |             library_dirs=[CUDA['lib64']],
110 |             libraries=['cudart', 'curand'],
111 |             extra_objects=[],
112 |             include_dirs=[ \
113 |               "cpp/include/", np.get_include(), pybind11.get_include(),
114 |               pybind11.get_include(True), CUDA['include'],
115 |               "3rd/json11", "3rd/spdlog/include"]),
116 | ]
117 | 
118 | 
119 | # Return the git revision as a string
120 | def git_version():
121 |   def _minimal_ext_cmd(cmd):
122 |     # construct minimal environment
123 |     env = {}
124 |     for k in ['SYSTEMROOT', 'PATH']:
125 |       val = os.environ.get(k)
126 |       if val is not None:
127 |         env[k] = val
128 |     out = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=env). \
129 |       communicate()[0]
130 |     return out
131 | 
132 |   try:
133 |     out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
134 |     git_revision = out.strip().decode('ascii')
135 |   except OSError:
136 |     git_revision = "Unknown"
137 | 
138 |   return git_revision
139 | 
140 | 
141 | def write_version_py(filename='cusim/version.py'):
142 |   cnt = """
143 | short_version = '%(version)s'
144 | git_revision = '%(git_revision)s'
145 | """
146 |   git_revision = git_version()
147 |   with open(filename, 'w') as fout:
148 |     fout.write(cnt % {'version': VERSION,
149 |               'git_revision': git_revision})
150 | 
151 | 
152 | class BuildExtension(BUILDEXT):
153 |   def run(self):
154 |     for ext in self.extensions:
155 |       print(ext.name)
156 |       if hasattr(ext, 'extension_type') and ext.extension_type == 'cmake':
157 |         self.cmake()
158 |     super().run()
159 | 
160 |   def cmake(self):
161 |     cwd = pathlib.Path().absolute()
162 | 
163 |     build_temp = pathlib.Path(self.build_temp)
164 |     build_temp.mkdir(parents=True, exist_ok=True)
165 | 
166 |     build_type = 'Debug' if self.debug else 'Release'
167 | 
168 |     cmake_args = [
169 |       '-DCMAKE_BUILD_TYPE=' + build_type,
170 |       '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + CLIB_DIR,
171 |     ]
172 | 
173 |     build_args = []
174 | 
175 |     os.chdir(str(build_temp))
176 |     self.spawn(['cmake', str(cwd)] + cmake_args)
177 |     if not self.dry_run:
178 |       self.spawn(['cmake', '--build', '.'] + build_args)
179 |     os.chdir(str(cwd))
180 | 
181 | 
182 | def setup_package():
183 |   write_version_py()
184 |   cmdclass = {
185 |     'build_ext': BuildExtension
186 |   }
187 | 
188 |   metadata = dict(
189 |     name='cusim',
190 |     maintainer="Jisang Yoon",
191 |     maintainer_email="vjs10101v@gmail.com",
192 |     author="Jisang Yoon",
193 |     author_email="vjs10101v@gmail.com",
194 |     description=DOCLINES[0],
195 |     long_description="\n".join(DOCLINES[2:]),
196 |     url="https://github.com/js1010/cusim",
197 |     download_url="https://github.com/js1010/cusim/releases",
198 |     include_package_data=False,
199 |     license='Apache2',
200 |     packages=['cusim/', "cusim/ioutils/", "cusim/culda/", "cusim/cuw2v/"],
201 |     install_requires=INSTALL_REQUIRES,
202 |     cmdclass=cmdclass,
203 |     classifiers=[_f for _f in CLASSIFIERS.split('\n') if _f],
204 |     platforms=['Linux', 'Mac OSX', 'Unix'],
205 |     ext_modules=extensions,
206 |     entry_points={
207 |       'console_scripts': [
208 |       ]
209 |     },
210 |     python_requires='>=3.6',
211 |   )
212 | 
213 |   metadata['version'] = VERSION
214 |   setup(**metadata)
215 | 
216 | 
217 | if __name__ == '__main__':
218 |   setup_package()
219 | 


--------------------------------------------------------------------------------