├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── requirements.txt ├── speech ├── __init__.py ├── ctc.py ├── data │ ├── __init__.py │ ├── dataloader.py │ └── ingest_librispeech.py ├── decoder.py ├── evaluate.py ├── sample_proposals_callback.py ├── train.py └── utils.py └── src └── transforms ├── .gitignore ├── Makefile └── warp-ctc ├── CMakeLists.txt ├── LICENSE ├── README.md ├── doc ├── baidu-research-logo-small.png └── deep-speech-ctc-small.png ├── examples ├── loader.py ├── rnnctc.py └── simple.py ├── include ├── contrib │ └── moderngpu │ │ ├── LICENSE │ │ └── include │ │ ├── device │ │ ├── ctaloadbalance.cuh │ │ ├── ctamerge.cuh │ │ ├── ctascan.cuh │ │ ├── ctasearch.cuh │ │ ├── ctasegreduce.cuh │ │ ├── ctasegscan.cuh │ │ ├── ctasegsort.cuh │ │ ├── ctasortedsearch.cuh │ │ ├── devicetypes.cuh │ │ ├── deviceutil.cuh │ │ ├── intrinsics.cuh │ │ ├── loadstore.cuh │ │ ├── serialsets.cuh │ │ └── sortnetwork.cuh │ │ ├── mgpudevice.cuh │ │ ├── mgpuenums.h │ │ └── util │ │ └── static.h ├── ctc.h └── detail │ ├── cpu_ctc.h │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ ├── hostdevice.h │ └── reduce.h ├── python ├── __init__.py ├── ctc.py └── setup.py ├── src ├── ctc_entrypoint.cpp ├── ctc_entrypoint.cu └── reduce.cu └── tests ├── test.h ├── test_cpu.cpp └── test_gpu.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.sublime-project 2 | *.sublime-workspace 3 | *.pyc 4 | *.pkl 5 | *.prm 6 | *.so 7 | *.swo 8 | *.swp 9 | .DS_Store 10 | neon/version.py 11 | *@eaDir 12 | .pkgs 13 | *.egg-info 14 | .venv 15 | .venv[23] 16 | .styleenv 17 | .coverage 18 | build 19 | *.gz 20 | generated 21 | *.ropeproject 22 | *.cubin 23 | *.hdf5 24 | *.h5 25 | *.html 26 | *.txt 27 | *.log 28 | neon_help_output.txt 29 | neon/backends/util/cuda_capability 30 | neon/backends/kernels/ptx 31 | neon/backends/kernels/pre 32 | neon/backends/kernels/dump 33 | neon/data/loader/loader 34 | .idea/ 35 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default all clean 2 | 3 | default: all 4 | 5 | all: src/transforms/libwarpctc.so 6 | 7 | clean: 8 | @$(MAKE) -C src/transforms clean 9 | @find . -name '*.pyc' -delete 10 | 11 | src/transforms/libwarpctc.so: 12 | @$(MAKE) -C src/transforms 13 | 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | # Implementation of Deep Speech 2 in neon 6 | 7 | This repository contains an implementation of Baidu SVAIL's [Deep Speech 2] 8 | model in neon. 
Much of the model is readily available in mainline neon; to also 9 | support the CTC cost function, we have included a neon-compatible wrapper for 10 | Baidu's [Warp-CTC]. 11 | 12 | Deep Speech 2 models are computationally intensive, and thus they can 13 | require long periods of time to run. Even with near-perfect GPU utilization, 14 | the model can take up to 1 week to train on large enough datasets to see 15 | respectable performance. Please keep this in mind when exploring this repo. 16 | 17 | We have used this code to train models on both the Wall Street Journal 18 | (81 hours) and Librispeech (1000 hours) datasets. The WSJ dataset is 19 | available through the LDC only; however, Librispeech can be freely acquired 20 | from [Librispeech corpus]. 21 | 22 | The model presented here uses a basic argmax-based decoder: 23 | 24 | * Choose the most probable character in each frame 25 | * Collapse the resulting output string according to CTC's rules: remove repeat 26 | characters first, remove blank characters next. 
27 | 28 | After decoding, you might expect outputs like this when trained on WSJ data: 29 | 30 | | Ground truth | Model output | 31 | |---------------------------------|-----------------------------------| 32 | | united presidential is a life insurance company | younited presidentiol is a lefe in surance company | 33 | | that was certainly true last week | that was sertainly true last week | 34 | | we're not ready to say we're in technical default a spokesman said | we're now ready to say we're intechnical default a spokesman said | 35 | 36 | Or outputs like this when trained on Librispeech (see "Decoding and 37 | evaluating a trained model"): 38 | 39 | | Ground truth | Model output | 40 | |---------------------------------|-----------------------------------| 41 | | this had some effect in calming him | this had some offectind calming him | 42 | | he went in and examined his letters but there was nothing from carrie | he went in an examined his letters but there was nothing from carry | 43 | | the design was different but the thing was clearly the same | the design was differampat that thing was clarly the same | 44 | 45 | ## Getting Started 46 | 1. [neon 2.3.0] and the [aeon] dataloader (v1.0.0) must both be installed. 47 | 48 | 2. Clone the repo: ```git clone https://github.com/NervanaSystems/deepspeech.git && cd deepspeech```. 49 | 50 | 3. Within a neon virtualenv, run ```pip install -r requirements.txt```. 51 | 52 | 4. Run ```make``` to build warp-ctc. 53 | 54 | ## Training a model 55 | ### 1. Prepare a manifest file for your dataset. 56 | The details on how to go about doing this are determined by the specifics of 57 | the dataset. 58 | 59 | 60 | #### Example: Librispeech recipe 61 | A recipe for ingesting Librispeech data is provided in ``data/ingest_librispeech.py``. 62 | Note that Librispeech provides distinct datasets for training and validation, 63 | and each set must be ingested separately. 
Additionally, we'll have to 64 | get around the quirky way that the Librispeech data is distributed; after 65 | "unpacking" the archives, we should re-pack them in a consistent manner. 66 | 67 | To be more precise, Librispeech data is distributed in zipped tar files, e.g. 68 | `train-clean-100.tar.gz` for training and `dev-clean.tar.gz` for validation. 69 | Upon unpacking, each archive creates a directory named ``LibriSpeech``, so 70 | trying to unpack both files together in the same directory is a bad idea. To 71 | get around this, try something like: 72 | 73 | ``` 74 | $ mkdir librispeech && cd librispeech 75 | $ wget http://www.openslr.org/resources/12/train-clean-100.tar.gz 76 | $ wget http://www.openslr.org/resources/12/dev-clean.tar.gz 77 | $ tar xvzf dev-clean.tar.gz LibriSpeech/dev-clean --strip-components=1 78 | $ tar xvzf train-clean-100.tar.gz LibriSpeech/train-clean-100 --strip-components=1 79 | ``` 80 | 81 | Follow the above prescription and you will have the training data as a 82 | subdirectory `librispeech/train-clean-100` and the validation data in a 83 | subdirectory `librispeech/dev-clean`. To ingest the data, you would then run the 84 | python script on the directory where you've unpacked the clean training data, 85 | followed by directions to where you want the script to write the transcripts and 86 | training mainfests for that dataset: 87 | 88 | ``` 89 | $ python data/ingest_librispeech.py 90 | ``` 91 | 92 | For example, if the absolute path to the train-clean-100 directory is located in 93 | ``/usr/local/data/librispeech/train-clean-100``, run: 94 | 95 | ``` 96 | $ python data/ingest_librispeech.py /usr/local/data/librispeech/train-clean-100 /usr/local/data/librispeech/train-clean-100/transcripts_dir /usr/local/data/librispeech/train-clean-100/train-manifest.csv 97 | ``` 98 | 99 | which would create a training manifest file named train-manifest.csv. 
Similarly, 100 | if the absolute path to the dev-clean directory is located at 101 | ``/usr/local/data/librispeech/dev-clean``, run: 102 | 103 | ``` 104 | $ python data/ingest_librispeech.py /usr/local/data/librispeech/dev-clean /usr/local/data/librispeech/dev-clean/transcripts_dir /usr/local/data/librispeech/train-clean-100/val-manifest.csv 105 | ``` 106 | 107 | To train on the full 1000 hours, execute the same commands for the 360 hour 108 | and 540 hour training datasets as well. The manifest files can then be 109 | concatenated with a simple: 110 | ``` 111 | $ cat /path/to/100_hour_manifest.csv /path/to/360_hour_manifest.csv /path/to/540_hour_manifest.csv > /path/to/1000_hour_manifest.csv 112 | ``` 113 | 114 | 115 | ### 2a. Train a new model 116 | 117 | ``` 118 | $ python train.py --manifest train: --manifest val: -e -z -s [-b ] 119 | ``` 120 | 121 | where `` is the path to the training manifest file produced 122 | in the ingest. For the example above, that path is ``/usr/local/data/librispeech/train-clean-100/train-manifest.csv``) 123 | and `` is the path to the validation manifest file. 124 | 125 | ### 2b. Continue training after pause on a previous model 126 | For a previously-trained model that wasn't trained for the full time needed, it's 127 | possible to resume training by passing the `--model_file ` 128 | argument to `train.py`. For example, you could continue training a pre-trained 129 | model from our [Model Zoo] sample. 130 | This particular model was trained using 1000 hours of speech data from the 131 | [Librispeech corpus]. The model was trained for 132 | 16 epochs after attaining a Character Error Rate (CER) of 14% without using a 133 | language model. You could continue training it for, say, an additional 4 epochs, 134 | by calling: 135 | 136 | ``` 137 | $ python train.py --manifest train: --manifest val: -e20 -z -s --model_file [-b ] 138 | ``` 139 | 140 | which will save a new model to `model_output.prm`. 
141 | 142 | ## Decoding and evaluating a trained model 143 | After you have a trained model, it's easy to evaluate its performance on any 144 | given dataset. Simply create a manifest file and then call: 145 | 146 | ``` 147 | $ python evaluate.py --manifest val:/path/to/manifest.csv --model_file /path/to/saved_model.prm 148 | ``` 149 | 150 | replacing the file paths as needed. It prints CERs (Character Error Rates) by 151 | default. To instead print WERs (Word Error Rates), include the argument 152 | `--use_wer`. 153 | 154 | For example, you could evaluate our pre-trained model from our [Model Zoo]. To 155 | evaluate the pre-trained model, follow these steps: 156 | 157 | 1. Download some test data from the Librispeech ASR corpus and prepare a 158 | manifest file for the dataset that follows the prescription provided above. 159 | 160 | 2. Download the [pre-trained DS2 model from our Model Zoo]. 161 | 162 | 3. Subject the pre-trained model and the manifest file for the test data to the 163 | `evaluate.py` script, as described above. 164 | 165 | 4. Optionally inspect the transcripts produced by the trained model; this can 166 | be done by appending it with the argument `--inference_file `. 167 | The result dumps the model transcripts together with the corresponding 168 | "ground truth" transcripts to a pickle file. 
169 | 170 | 171 | [Deep Speech 2]:https://arxiv.org/abs/1512.02595 172 | [neon 2.3.0]:https://github.com/NervanaSystems/neon 173 | [aeon]:https://github.com/NervanaSystems/aeon 174 | [Warp-CTC]: https://github.com/baidu-research/warp-ctc 175 | [Librispeech corpus]:http://www.openslr.org/12 176 | [Model Zoo]:https://github.com/NervanaSystems/ModelZoo 177 | [pre-trained DS2 model from our Model Zoo]:https://s3-us-west-1.amazonaws.com/nervana-modelzoo/Deep_Speech/Librispeech/librispeech_16_epochs.prm 178 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-levenshtein==0.12.0 2 | tqdm==4.8.4 3 | -------------------------------------------------------------------------------- /speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/speech/__init__.py -------------------------------------------------------------------------------- /speech/ctc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | from neon import NervanaObject 17 | import platform 18 | import os 19 | import numpy as np 20 | import numpy.ctypeslib as npct 21 | import ctypes as ct 22 | import ctypes.util 23 | from neon.transforms.cost import Cost 24 | 25 | 26 | class CTC(Cost): 27 | 28 | def __init__(self, max_label_len, nout=29, blank=0): 29 | 30 | self.max_s = int(max_label_len) 31 | self.nout = nout 32 | self.input_warp = None 33 | self.y = None 34 | self.input_lengths = self.be.zeros((self.be.bsz), dtype=np.int32) 35 | 36 | self.ctclib = None 37 | 38 | def init_buffer(self, y): 39 | 40 | if self.input_warp is None or self.y is None or self.y is not y: 41 | self.y = y 42 | # warp-CTC requires activations.shape = (T, bsz, nout) 43 | self.input_warp = self.be.zeros( 44 | (self.max_t, self.be.bsz, self.nout)) 45 | # warp-CTC requires gradients.shape = (T, bsz, nout) 46 | self.grad_warp = self.be.zeros( 47 | (self.max_t, self.be.bsz, self.nout)) 48 | # warp-CTC requires cost.shape = (1, bsz) 49 | self.ctc_warp = self.be.zeros((1, self.be.bsz)) 50 | 51 | # neon requires gradients.shape = (nout, T*bsz) 52 | self.grad = self.be.iobuf((self.nout, self.max_t)) 53 | self.grad_view = self.grad.reshape( 54 | self.nout, self.max_t, self.be.bsz) 55 | # neon requires cost.shape = (nout, T*bsz) 56 | self.ctc_cost = self.be.zeros((1, self.max_t * self.be.bsz)) 57 | self.ctc_cost_view = self.ctc_cost.reshape( 58 | self.max_t, self.be.bsz).T 59 | 60 | def __call__(self, y, t): 61 | 62 | activations = y.reshape(self.nout, -1, self.be.bsz) 63 | self.max_t = activations.shape[1] 64 | self.init_buffer(y) 65 | self.grad_warp.fill(0.) 66 | self.ctc_warp.fill(0.) 67 | self.ctc_cost.fill(0.) 
68 | # flat_labels: minibatch worth of transcripts 69 | flat_labels = t[0] 70 | 71 | # label_lengths: minibatch worth of transcript lengths 72 | label_lengths = t[1] 73 | 74 | # input_lengths: minibatch worth of activation lengths 75 | self.input_lengths[:] = t[2].T * int(activations.shape[1]) / 100 76 | 77 | # reshape activations to match warp-CTC format 78 | self.be.copy_transpose(activations, self.input_warp, (1, 2, 0)) 79 | 80 | # call into warp-CTC 81 | self.be_ctc( 82 | self.nout, 83 | self.input_warp, 84 | flat_labels, 85 | self.grad_warp, 86 | label_lengths, 87 | self.input_lengths, 88 | self.ctc_warp) 89 | 90 | # warp-ctc only returns 1 cost for each example 91 | # broadcast ctc_warp (shape = (1,bsz)) to ctc_cost (shape=(1, T*bsz)) 92 | self.ctc_cost_view[:] = self.ctc_warp.T 93 | 94 | return self.ctc_cost 95 | 96 | def be_ctc( 97 | self, 98 | nout, 99 | inputs, 100 | labels, 101 | grads, 102 | label_lens, 103 | input_lens, 104 | costs): 105 | 106 | libpath = os.path.join(os.path.dirname(__file__), 107 | "..", "src", "transforms", "libwarpctc.so") 108 | assert os.path.isfile(libpath), "libwarpctc.so not found. 
Run make" 109 | self.ctclib = npct.load_library(libpath, "") 110 | 111 | if self.be.backend_name == "gpu": 112 | self.be_ctc_gpu( 113 | nout, 114 | inputs, 115 | labels, 116 | grads, 117 | label_lens, 118 | input_lens, 119 | costs) 120 | elif self.be.backend_name == "cpu" or self.be.backend_name == "mkl": 121 | self.be_ctc_cpu( 122 | inputs, 123 | labels, 124 | grads, 125 | label_lens, 126 | input_lens, 127 | costs, 128 | nout) 129 | else: 130 | raise NotImplementedError() 131 | 132 | def be_ctc_gpu( 133 | self, 134 | nout, 135 | inputs, 136 | labels, 137 | grads, 138 | label_lens, 139 | input_lens, 140 | costs): 141 | """ 142 | Calling Warp-CTC 143 | """ 144 | 145 | # Set up cuda stream 146 | if self.be.stream is None: 147 | stream_buf = ct.cast(self.be.stream, ct.c_void_p) 148 | else: 149 | stream_buf = ct.cast( 150 | id(self.be.stream), ct.POINTER(ct.c_void_p)).contents 151 | 152 | # map first function to get workspace size 153 | self.ctclib.get_workspace_size_gpu.restype = int 154 | self.ctclib.get_workspace_size_gpu.argtypes = [npct.ndpointer(dtype=np.int32, ndim=1), 155 | npct.ndpointer(dtype=np.int32, ndim=1), 156 | ct.c_int, 157 | ct.c_int, 158 | ct.c_void_p] 159 | scratch_size = self.ctclib.get_workspace_size_gpu(np.array(label_lens.get().ravel(), 160 | dtype=np.int32), 161 | np.array(input_lens.get().ravel(), 162 | dtype=np.int32), 163 | nout, self.be.bsz, 164 | stream_buf) 165 | self.be.set_scratch_size(scratch_size) 166 | workspace = self.be.scratch_buffer(scratch_size) 167 | 168 | # map ctc function 169 | self.ctclib.compute_ctc_loss_gpu.restype = int 170 | self.ctclib.compute_ctc_loss_gpu.argtypes = [ct.POINTER(ct.c_float), 171 | ct.POINTER(ct.c_float), 172 | npct.ndpointer(dtype=np.int32, ndim=1), 173 | npct.ndpointer(dtype=np.int32, ndim=1), 174 | npct.ndpointer(dtype=np.int32, ndim=1), 175 | ct.c_int, 176 | ct.c_int, 177 | ct.POINTER(ct.c_float), 178 | ct.POINTER(ct.c_char), 179 | ct.c_void_p] 180 | 181 | inputs_buf = ct.cast(int(inputs.gpudata), 
ct.POINTER(ct.c_float)) 182 | grads_buf = ct.cast(int(grads.gpudata), ct.POINTER(ct.c_float)) 183 | costs_buf = ct.cast(int(costs.gpudata), ct.POINTER(ct.c_float)) 184 | workspace_buf = ct.cast(workspace, ct.POINTER(ct.c_char)) 185 | 186 | status = self.ctclib.compute_ctc_loss_gpu(inputs_buf, 187 | grads_buf, 188 | np.array(labels.get().ravel(), 189 | dtype=np.int32), 190 | np.array(label_lens.get().ravel(), 191 | dtype=np.int32), 192 | np.array(input_lens.get().ravel(), 193 | dtype=np.int32), 194 | nout, 195 | self.be.bsz, 196 | costs_buf, 197 | workspace_buf, 198 | stream_buf) 199 | 200 | assert status is 0, "Warp-CTC run failed" 201 | return 202 | 203 | def be_ctc_cpu( 204 | self, 205 | inputs, 206 | labels, 207 | grads, 208 | label_lens, 209 | input_lens, 210 | costs, 211 | nout): 212 | """ 213 | Calling Warp-CTC 214 | """ 215 | 216 | # Workspace is allocated in ctc_entrypoint.cpp since the CPU backend in neon doesn't have 217 | # scratch space support 218 | # Map compute_ctc_loss 219 | self.ctclib.compute_ctc_loss_cpu.restype = int 220 | self.ctclib.compute_ctc_loss_cpu.argtypes = [ 221 | npct.ndpointer(dtype=np.float32, ndim=3), 222 | npct.ndpointer(dtype=np.float32, ndim=3), 223 | npct.ndpointer(dtype=np.int32, ndim=1), 224 | npct.ndpointer(dtype=np.int32, ndim=1), 225 | npct.ndpointer(dtype=np.int32, ndim=1), 226 | ctypes.c_int, 227 | ctypes.c_int, 228 | npct.ndpointer(dtype=np.float32, ndim=1), 229 | ctypes.c_int] 230 | 231 | num_threads = 8 232 | _inputs = np.array(inputs.get(), dtype=np.float32) 233 | _grads = np.array(grads.get(), dtype=np.float32) 234 | _labels = np.array(labels.get().ravel(), dtype=np.int32) 235 | _label_lens = np.array(label_lens.get().ravel(), dtype=np.int32) 236 | _input_lens = np.array(input_lens.get().ravel(), dtype=np.int32) 237 | _costs = np.array(costs.get().ravel(), dtype=np.float32) 238 | status = self.ctclib.compute_ctc_loss_cpu(_inputs, 239 | _grads, 240 | _labels, 241 | _label_lens, 242 | _input_lens, 243 | nout, 244 | 
self.be.bsz, 245 | _costs, 246 | num_threads) 247 | 248 | assert status is 0, "Warp-CTC run failed" 249 | costs[:] = _costs 250 | grads[:] = _grads 251 | return 252 | 253 | def bprop(self, y, t): 254 | # warp-ctc returns grads with shape (T, bsz, nout), 255 | # so reshape warp-ctc grads to match neon grads 256 | self.be.copy_transpose(self.grad_warp, self.grad_view, (2, 0, 1)) 257 | 258 | return self.grad 259 | -------------------------------------------------------------------------------- /speech/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/speech/data/__init__.py -------------------------------------------------------------------------------- /speech/data/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2017 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | import os 17 | import numpy as np 18 | from neon.data.aeon_shim import AeonDataLoader 19 | from neon.data.dataloader_transformers import TypeCast, Retuple 20 | 21 | 22 | def common_config(manifest_file, batch_size, alphabet, nbands, max_tscrpt_len, max_utt_len): 23 | 24 | audio_config = {"type": "audio", 25 | "sample_freq_hz": 16000, 26 | "max_duration": "{} seconds".format(max_utt_len), 27 | "frame_length": "25 milliseconds", 28 | "frame_stride": "10 milliseconds", 29 | "feature_type": "mfsc", 30 | "emit_length": True, 31 | "num_filters": nbands} 32 | 33 | transcription_config = {"type": "char_map", 34 | "alphabet": alphabet, 35 | "emit_length": True, 36 | "max_length": max_tscrpt_len} 37 | 38 | return {'manifest_filename': manifest_file, 39 | 'manifest_root': os.path.dirname(manifest_file), 40 | 'batch_size': batch_size, 41 | 'block_size': batch_size, 42 | 'etl': [audio_config, transcription_config]} 43 | 44 | 45 | def wrap_dataloader(dl): 46 | """ Data is loaded from Aeon as a 4-tuple. We need to cast the audio 47 | (index 0) from int8 to float32 and repack the data into (audio, 3-tuple). 48 | """ 49 | 50 | dl = TypeCast(dl, index=0, dtype=np.float32) 51 | dl = Retuple(dl, data=(0,), target=(2, 3, 1)) 52 | return dl 53 | 54 | 55 | def make_loader(manifest_file, alphabet, nbands, max_tscrpt_len, max_utt_len, backend_obj): 56 | aeon_config = common_config(manifest_file, backend_obj.bsz, alphabet, nbands, max_tscrpt_len, 57 | max_utt_len) 58 | return wrap_dataloader(AeonDataLoader(aeon_config)) 59 | -------------------------------------------------------------------------------- /speech/data/ingest_librispeech.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 
4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ---------------------------------------------------------------------------- 16 | 17 | import glob 18 | import logging 19 | import os 20 | 21 | logging.basicConfig() 22 | logger = logging.getLogger(__name__) 23 | logger.setLevel(logging.INFO) 24 | 25 | 26 | def write_manifest(output_file, *filenames): 27 | """ Writes out a manifest file from a series of lists of filenames 28 | """ 29 | 30 | with open(output_file, "w") as fid: 31 | fid.write("@{}\n".format("\t".join(["FILE"] * len(filenames)))) 32 | for line in zip(*filenames): 33 | fid.write("\t".join(line) + "\n") 34 | 35 | return True 36 | 37 | 38 | def main(input_directory, transcript_directory, manifest_file): 39 | """ Finds all .flac files recursively in input_directory, then extracts the 40 | transcript from the nearby .trans.txt file and stores it in 41 | transcript_directory. Writes a manifest file referring to each .flac file 42 | and its paired transcript. 43 | 44 | Arguments: 45 | input_directory (string): Path to librispeech directory 46 | transcript_directory (string): Path to directory in which to write 47 | individual transcript files. 48 | manifest_file (string): Path to manifest file to output. 
49 | """ 50 | 51 | def librispeech_flac_filename(filestr): 52 | parts = filestr.split("-") 53 | 54 | return os.path.join(input_directory, parts[0], parts[1], 55 | "{}.flac".format(filestr)) 56 | 57 | if not os.path.isdir(input_directory): 58 | raise IOError("Data directory does not exist! {}".format(input_directory)) 59 | 60 | if not os.path.exists(transcript_directory): 61 | os.makedirs(transcript_directory) 62 | 63 | transcript_files = glob.glob(os.path.join(input_directory, '*/*/*.txt')) 64 | if len(transcript_files) == 0: 65 | logger.error("No .txt files were found in {}".format(input_directory)) 66 | return 67 | 68 | logger.info("Beginning audio conversions") 69 | audio_files = list() 70 | txt_files = list() 71 | for ii, tfile in enumerate(transcript_files): 72 | # transcript file specifies transcript and flac filename for all librispeech files 73 | logger.info("Converting audio corresponding to transcript " 74 | "{} of {}".format(ii, len(transcript_files))) 75 | with open(tfile, "r") as fid: 76 | lines = fid.readlines() 77 | 78 | for line in lines: 79 | filestr, transcript = line.split(" ", 1) 80 | try: 81 | flac_file = librispeech_flac_filename(filestr) 82 | except IndexError: # filestr is not the format we are expecting 83 | print("filestr of unexpected formatting: {}".format(filestr)) 84 | print("error in {}".format(tfile)) 85 | continue 86 | txt_file = os.path.join(transcript_directory, 87 | "{}.txt".format(filestr)) 88 | 89 | # Write out short transcript file 90 | with open(txt_file, "w") as fid: 91 | fid.write(transcript.strip()) 92 | 93 | # Add to output lists to be written to manifest 94 | audio_files.append(flac_file) 95 | txt_files.append(txt_file) 96 | 97 | logger.info("Writing manifest file to {}".format(manifest_file)) 98 | return write_manifest(manifest_file, audio_files, txt_files) 99 | 100 | 101 | def convert_aeon_manifests(old_manifest, new_manifest): 102 | """Convert an Aeon < 1.0 manifest to an Aeon >= 1.0 manifest""" 103 | try: 104 | with 
open(old_manifest, "r") as old: 105 | with open(new_manifest, "w") as new: 106 | lines = old.readlines() 107 | nfields = len(lines[0].split(",")) 108 | new.write("@{}\n".format("\t".join(["FILE"] * nfields))) 109 | for line in lines: 110 | new_line = "\t".join(line.strip().split(",")) 111 | new.write("{}\n".format(new_line)) 112 | except: 113 | return False 114 | return True 115 | 116 | 117 | if __name__ == "__main__": 118 | import argparse 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("input_directory", 121 | help="Directory containing librispeech flac files") 122 | parser.add_argument("transcript_directory", 123 | help="Directory to write transcript .txt files") 124 | parser.add_argument("manifest_file", 125 | help="Output file that specifies the filename for each" 126 | " output audio and transcript") 127 | 128 | args = parser.parse_args() 129 | main(args.input_directory, 130 | args.transcript_directory, 131 | args.manifest_file) 132 | -------------------------------------------------------------------------------- /speech/decoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | 17 | import numpy as np 18 | import Levenshtein as Lev 19 | 20 | 21 | class Decoder(object): 22 | """ 23 | Basic decoder class from which all other decoders inherit. Implements several 24 | helper functions. Subclasses should implement the decode() method. 25 | 26 | Arguments: 27 | alphabet (string): mapping from integers to characters. 28 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 29 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 30 | """ 31 | 32 | def __init__(self, alphabet, blank_index=0, space_index=1): 33 | # e.g. alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 34 | self.alphabet = alphabet 35 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(alphabet)]) 36 | self.blank_index = blank_index 37 | self.space_index = space_index 38 | 39 | def convert_to_string(self, sequence): 40 | "Given a numeric sequence, returns the corresponding string" 41 | 42 | return ''.join([self.int_to_char[i] for i in sequence]) 43 | 44 | def process_string(self, sequence, remove_repetitions=False): 45 | """ 46 | Given a string, removes blanks and replace space character with space. 47 | Option to remove repetitions (e.g. 'abbca' -> 'abca'). 48 | 49 | Arguments: 50 | sequence (array of int): 1-d array of integers 51 | remove_repetitions (boolean, optional): If true, repeating characters 52 | are removed. Defaults to False. 53 | """ 54 | string = '' 55 | 56 | for i, char in enumerate(sequence): 57 | if(char != self.int_to_char[self.blank_index]): 58 | # if this char is a repetition and remove_repetitions=true, 59 | # skip. 
60 | if(remove_repetitions and i != 0 and char == sequence[i - 1]): 61 | pass 62 | elif(char == self.alphabet[self.space_index]): 63 | string = string + ' ' 64 | else: 65 | string = string + char 66 | 67 | return string 68 | 69 | def log_sum(self, list_of_probs): 70 | """ 71 | Computes the sum of log-probabilities. 72 | 73 | Arguments: 74 | list_of_probs (iterable): list of log-probabilities 75 | """ 76 | return np.log(np.sum([np.exp(p) for p in list_of_probs])) 77 | 78 | def wer(self, s1, s2): 79 | """ 80 | Computes the Word Error Rate, defined as the edit distance between the 81 | two provided sentences after tokenizing to words. 82 | Arguments: 83 | s1 (string): space-separated sentence 84 | s2 (string): space-separated sentence 85 | """ 86 | 87 | # build mapping of words to integers 88 | b = set(s1.split() + s2.split()) 89 | word2char = dict(zip(b, range(len(b)))) 90 | 91 | # map the words to a char array (Levenshtein packages only accepts 92 | # strings) 93 | w1 = [chr(word2char[w]) for w in s1.split()] 94 | w2 = [chr(word2char[w]) for w in s2.split()] 95 | 96 | return Lev.distance(''.join(w1), ''.join(w2)) 97 | 98 | def cer(self, s1, s2): 99 | """ 100 | Computes the Character Error Rate, defined as the edit distance. 101 | 102 | Arguments: 103 | s1 (string): space-separated sentence 104 | s2 (string): space-separated sentence 105 | """ 106 | return Lev.distance(s1, s2) 107 | 108 | def decode(self, probs): 109 | """ 110 | Given a matrix of character probabilities, returns the decoder's 111 | best guess of the transcription 112 | 113 | Arguments: 114 | probs (ndarray): Matrix of character probabilities, where probs[c,t] 115 | is the probability of character c at time t 116 | Returns: 117 | string: sequence of the model's best guess for the transcription 118 | 119 | """ 120 | raise NotImplementedError 121 | 122 | 123 | class ArgMaxDecoder(Decoder): 124 | 125 | def decode(self, probs): 126 | """ 127 | Returns the argmax decoding given the probability matrix. 
Removes 128 | repeated elements in the sequence, as well as blanks. 129 | """ 130 | string = self.convert_to_string(np.argmax(probs, axis=0)) 131 | return self.process_string(string, remove_repetitions=True) -------------------------------------------------------------------------------- /speech/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | 17 | import os 18 | import numpy as np 19 | import pickle as pkl 20 | 21 | from neon.backends import gen_backend 22 | from neon.util.argparser import NeonArgparser, extract_valid_args 23 | from neon.models import Model 24 | 25 | from decoder import ArgMaxDecoder 26 | from utils import get_wer 27 | 28 | from data.dataloader import make_loader 29 | 30 | # Parse the command line arguments 31 | arg_defaults = {'batch_size': 32} 32 | parser = NeonArgparser(__doc__, default_overrides=arg_defaults) 33 | parser.add_argument('--use_wer', action="store_true", 34 | help='compute wer instead of cer.') 35 | parser.add_argument('--inference_file', default=None, 36 | help='saves results in inference_file.') 37 | parser.add_argument('--print_examples', action="store_true", 38 | help='print an example transcript for each batch') 39 | args = parser.parse_args() 40 | 41 | if args.model_file is None: 42 | raise ArgumentError("A model file is required for evaluation") 43 | 44 | if "val" not in args.manifest: 45 | raise ArgumentError("Please provide an argument of the form:\n" + \ 46 | "--manifest val:/path/to/validation/manifest.csv") 47 | 48 | # Setup parameters for argmax decoder 49 | alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 50 | nout = len(alphabet) 51 | argmax_decoder = ArgMaxDecoder(alphabet, space_index=alphabet.index(" ")) 52 | 53 | # Initialize our backend 54 | be = gen_backend(**extract_valid_args(args, gen_backend)) 55 | 56 | # Setup dataloader 57 | eval_manifest = args.manifest['val'] 58 | if not os.path.exists(eval_manifest): 59 | raise IOError("Manifest file {} not found".format(eval_manifest)) 60 | 61 | # Setup required dataloader parameters 62 | nbands = 13 63 | max_utt_len = 30 64 | max_tscrpt_len = 1300 65 | eval_set = make_loader(eval_manifest, alphabet, nbands, max_tscrpt_len, 66 | max_utt_len, backend_obj=be) 67 | 68 | # Load the model 69 | model = Model(args.model_file) 70 | 
71 | # Process data and compute stats 72 | wer, sample_size, results = get_wer(model, be, eval_set, argmax_decoder, nout, 73 | use_wer=args.use_wer, print_examples=args.print_examples) 74 | 75 | print("\n" + "-" * 80) 76 | if args.use_wer: 77 | print("wer = {}".format(wer)) 78 | else: 79 | print("cer = {}".format(wer)) 80 | print("-" * 80 + "\n") 81 | 82 | if args.inference_file: 83 | # Save results in args.inference_file 84 | with open(args.inference_file, 'wb') as f: 85 | pkl.dump((results, wer), f) 86 | print("Saved inference results to {}".format(args.inference_file)) 87 | -------------------------------------------------------------------------------- /speech/sample_proposals_callback.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | import numpy as np 17 | import sys 18 | from neon.callbacks.callbacks import Callback 19 | 20 | 21 | class WordErrorRateCallback(Callback): 22 | 23 | def __init__(self, eval_set, decoder, max_s, noise_label=None, epoch_freq=1): 24 | super(WordErrorRateCallback, self).__init__(epoch_freq=epoch_freq) 25 | 26 | self.eval_set = eval_set 27 | self.nout = len(decoder.alphabet) 28 | self.decoder = decoder 29 | if noise_label is None: 30 | self.noise_label = '' 31 | else: 32 | self.noise_label = noise_label 33 | 34 | def decrypt(self, decoder, message, noise_label): 35 | msg = decoder.convert_to_string(message) 36 | return decoder.process_string(msg, remove_repetitions=False 37 | ).replace(noise_label, '') 38 | 39 | def softmax(self, x): 40 | return (np.reciprocal(np.sum( 41 | np.exp(x - np.max(x, axis=0)), axis=0)) * 42 | np.exp(x - np.max(x, axis=0))) 43 | 44 | def dev_to_host(self, dev_tensor): 45 | if self.be.distribute_data(dev_tensor, "Disabled"): 46 | revert = True 47 | else: 48 | revert = False 49 | host_tensor = dev_tensor.get() 50 | if revert: 51 | self.be.revert_tensor(dev_tensor) 52 | return host_tensor 53 | 54 | def get_outputs(self, model, inputs): 55 | outputs = model.fprop(inputs, inference=True) 56 | return self.softmax(self.dev_to_host(outputs)).reshape( 57 | (self.nout, -1, self.be.bsz)).transpose((2, 0, 1)) 58 | 59 | def get_wer(self, model, dataset, noise_symbol=None): 60 | if noise_symbol is None: 61 | noise_symbol = '' 62 | cer = 0 63 | batch_count = 1e-10 64 | for x, y in dataset: 65 | batch_count += 1 66 | probs = self.get_outputs(model, x) 67 | strided_tmax = probs.shape[-1] 68 | flat_labels = self.dev_to_host(y[0])[0,:] 69 | tscrpt_lens = self.dev_to_host(y[1])[0, :] 70 | utt_lens = strided_tmax * self.dev_to_host(y[2])[0, :] / 100 71 | disp_indx = np.random.randint(self.be.bsz) 72 | for mu in range(self.be.bsz): 73 | prediction = 
self.decoder.decode(probs[mu, :, :int(utt_lens[mu])]) 74 | start = int(np.sum(tscrpt_lens[:mu])) 75 | target = flat_labels[start:start + tscrpt_lens[mu]].tolist() 76 | target = self.decrypt(self.decoder, target, noise_symbol) 77 | cer += self.decoder.cer(prediction, target) / (1.0 * len(target)) 78 | 79 | if mu == disp_indx: 80 | disp_proposal = prediction 81 | disp_target = target 82 | return cer / (batch_count * self.be.bsz), disp_proposal, disp_target 83 | 84 | def on_epoch_end(self, callback_data, model, epoch): 85 | cer, disp_proposal, disp_target = self.get_wer(model, self.eval_set) 86 | print(u"Proposal: {}".format(disp_proposal)) 87 | print(u"Target: {}".format(disp_target)) 88 | print("CER (validation) = {}".format(cer)) 89 | -------------------------------------------------------------------------------- /speech/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | """ 17 | Train ds2-style speech model on Librispeech 18 | """ 19 | 20 | import os 21 | import numpy as np 22 | 23 | from neon.backends import gen_backend 24 | from neon.callbacks.callbacks import Callbacks 25 | from neon.initializers import GlorotUniform, Constant, Gaussian 26 | from neon.layers import Conv, GeneralizedCost, Affine, DeepBiRNN 27 | from neon.models import Model 28 | from neon.transforms import Rectlin, Identity, Rectlinclip 29 | from neon.optimizers import GradientDescentMomentum 30 | from neon.util.argparser import NeonArgparser, extract_valid_args 31 | 32 | from ctc import CTC 33 | from decoder import ArgMaxDecoder 34 | from sample_proposals_callback import WordErrorRateCallback 35 | from data.dataloader import make_loader 36 | 37 | # Parse the command line arguments 38 | arg_defaults = {'batch_size': 32} 39 | 40 | parser = NeonArgparser(__doc__, default_overrides=arg_defaults) 41 | parser.add_argument('--nfilters', type=int, 42 | help='no. 
of conv filters', default=1152) 43 | parser.add_argument('--filter_width', type=int, 44 | help='width of conv filter', default=11) 45 | parser.add_argument('--str_w', type=int, help='stride in time', default=3) 46 | parser.add_argument('--depth', type=int, help='rnn depth', default=9) 47 | parser.add_argument('--hidden_size', type=int, 48 | help='affine/rnn hidden units', default=1152) 49 | parser.add_argument('--lr', type=float, 50 | help='learning rate', default=2e-5) 51 | parser.add_argument('--momentum', type=float, 52 | help='momentum', default=0.99) 53 | args = parser.parse_args() 54 | 55 | # Setup model hyperparameters 56 | # Convolution layer hyperparameters 57 | nfilters = args.nfilters # Number of convolutional filters 58 | filter_width = args.filter_width # Width of convolutional filters 59 | str_w = args.str_w # Convolutional filter stride 60 | 61 | # RNN hyperparameters 62 | depth = args.depth # Number of BiRNN layers 63 | hidden_size = args.hidden_size # Number of units in each BiRNN layer 64 | 65 | # Optimization hyperparameters 66 | learning_rate = args.lr 67 | momentum = args.momentum 68 | gradient_clip_norm = 400 69 | 70 | # Setup parameters for argmax decoder 71 | alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 72 | nout = len(alphabet) 73 | argmax_decoder = ArgMaxDecoder(alphabet, space_index=alphabet.index(" ")) 74 | 75 | # Initialize our backend 76 | be = gen_backend(**extract_valid_args(args, gen_backend)) 77 | 78 | # Setup dataloader 79 | nbands = 13 80 | max_tscrpt_len = 1300 81 | max_utt_len = 30 82 | 83 | train_manifest = args.manifest['train'] 84 | if not os.path.exists(train_manifest): 85 | raise RuntimeError( 86 | "training manifest file {} not found".format(train_manifest)) 87 | dev_manifest = args.manifest['val'] 88 | if not os.path.exists(dev_manifest): 89 | raise RuntimeError( 90 | "validation manifest file {} not found".format(dev_manifest)) 91 | 92 | train = make_loader(train_manifest, alphabet, nbands, max_tscrpt_len, max_utt_len, 
backend_obj=be) 93 | dev = make_loader(dev_manifest, alphabet, nbands, max_tscrpt_len, max_utt_len, backend_obj=be) 94 | 95 | # Setup the layers of the DNN 96 | # Softmax is performed in warp-ctc, so we use an Identity activation in the 97 | # final layer. 98 | gauss = Gaussian(scale=0.01) 99 | glorot = GlorotUniform() 100 | layers = [ 101 | Conv( 102 | (nbands, 103 | filter_width, 104 | nfilters), 105 | init=gauss, 106 | bias=Constant(0), 107 | activation=Rectlin(), 108 | padding=dict( 109 | pad_h=0, 110 | pad_w=5), 111 | strides=dict( 112 | str_h=1, 113 | str_w=str_w)), 114 | DeepBiRNN( 115 | hidden_size, 116 | init=glorot, 117 | activation=Rectlinclip(), 118 | batch_norm=True, 119 | reset_cells=True, 120 | depth=depth), 121 | Affine( 122 | hidden_size, 123 | init=glorot, 124 | activation=Rectlinclip()), 125 | Affine( 126 | nout=nout, 127 | init=glorot, 128 | activation=Identity())] 129 | 130 | model = Model(layers=layers) 131 | 132 | opt = GradientDescentMomentum(learning_rate, momentum, 133 | gradient_clip_norm=gradient_clip_norm, 134 | stochastic_round=False, 135 | nesterov=True) 136 | callbacks = Callbacks(model, eval_set=dev, **args.callback_args) 137 | 138 | # Print validation set word error rate at the end of every epoch 139 | pcb = WordErrorRateCallback(dev, argmax_decoder, max_tscrpt_len, epoch_freq=1) 140 | callbacks.add_callback(pcb) 141 | 142 | cost = GeneralizedCost(costfunc=CTC(max_tscrpt_len, nout=nout)) 143 | 144 | # Fit the model 145 | model.fit(train, optimizer=opt, num_epochs=args.epochs, 146 | cost=cost, callbacks=callbacks) 147 | -------------------------------------------------------------------------------- /speech/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 
4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ---------------------------------------------------------------------------- 16 | 17 | import sys 18 | import numpy as np 19 | from tqdm import tqdm 20 | 21 | 22 | def softmax(x): 23 | return (np.reciprocal(np.sum( 24 | np.exp(x - np.max(x, axis=0)), axis=0)) * 25 | np.exp(x - np.max(x, axis=0))) 26 | 27 | def get_outputs(model, be, inputs, nout): 28 | outputs = model.fprop(inputs, inference=False) 29 | return softmax(outputs.get()).reshape( 30 | (nout, -1, be.bsz)).transpose((2, 0, 1)) 31 | 32 | def eval_model(model, dataset, nout, bsz): 33 | return [((get_outputs(model, x, nout, bsz), 34 | y[0]), y[2]) for (x, y) in dataset] 35 | 36 | def decrypt(decoder, message): 37 | msg = decoder.convert_to_string(message) 38 | return decoder.process_string(msg, remove_repetitions=False) 39 | 40 | def get_wer(model, be, dataset, decoder, nout, use_wer=False, print_examples=False): 41 | wer = 0 42 | batchcount = 0 43 | predictions = list() 44 | targets = list() 45 | nbatches = dataset.nbatches 46 | 47 | if not model.initialized: 48 | model.initialize(dataset) 49 | 50 | progress_bar = tqdm(dataset, total=nbatches, unit="batches") 51 | for x, y in progress_bar: 52 | probs = get_outputs(model, be, x, nout) 53 | strided_tmax = probs.shape[-1] 54 | flat_labels = y[0].get().ravel() 55 | tscrpt_lens = y[1].get().ravel() 56 | utt_lens = strided_tmax * y[2].get().ravel() / 100 57 | for mu in 
range(be.bsz): 58 | prediction = decoder.decode(probs[mu, :, :int(utt_lens[mu])]) 59 | start = int(np.sum(tscrpt_lens[:mu])) 60 | target = flat_labels[start:start + tscrpt_lens[mu]].tolist() 61 | target = decrypt(decoder, target) 62 | predictions.append(prediction) 63 | targets.append(target) 64 | if not use_wer: 65 | wer += decoder.cer(prediction, target) / float(len(target)) 66 | else: 67 | wer += decoder.wer(prediction, target) / \ 68 | float(len(target.split())) 69 | 70 | if use_wer: 71 | progress_bar.set_description("WER: {}".format(wer / len(predictions))) 72 | else: 73 | progress_bar.set_description("CER: {}".format(wer / len(predictions))) 74 | if print_examples is True: 75 | progress_bar.write("Transcribed: {}".format(predictions[-1])) 76 | progress_bar.write("Target: {}".format(targets[-1])) 77 | 78 | results = zip(predictions, targets) 79 | nsamples = len(predictions) 80 | return wer / nsamples, nsamples , results 81 | -------------------------------------------------------------------------------- /src/transforms/.gitignore: -------------------------------------------------------------------------------- 1 | warp-ctc/build -------------------------------------------------------------------------------- /src/transforms/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default clean 2 | 3 | default: libwarpctc.so 4 | 5 | OS := $(shell uname) 6 | 7 | clean: 8 | @rm -rf warp-ctc/build 9 | @rm -rf libwarpctc.so 10 | 11 | libwarpctc.so: warp-ctc/build/libwarpctc.so 12 | @ln -sf warp-ctc/build/libwarpctc.so libwarpctc.so 13 | 14 | 15 | warp-ctc/build/libwarpctc.so: 16 | @rm -rf warp-ctc/build 17 | @mkdir warp-ctc/build 18 | @cd warp-ctc/build && cmake .. 
&& make 19 | ifeq ($(OS),Darwin) 20 | @cd warp-ctc/build && ln -sf libwarpctc.dylib libwarpctc.so 21 | endif 22 | 23 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (APPLE) 2 | cmake_minimum_required(VERSION 3.4) 3 | ELSE() 4 | cmake_minimum_required(VERSION 2.8) 5 | ENDIF() 6 | 7 | project(ctc_release) 8 | 9 | IF (NOT APPLE) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2 -g") 11 | ENDIF() 12 | 13 | IF (APPLE) 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2 -g") 15 | add_definitions(-DAPPLE) 16 | ENDIF() 17 | 18 | include_directories(include) 19 | 20 | FIND_PACKAGE(CUDA 6.5) 21 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}") 22 | 23 | # need to be at least 30 or __shfl_down in reduce wont compile 24 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") 25 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") 26 | 27 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") 28 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") 29 | 30 | # https://github.com/baidu-research/warp-ctc/commit/ecc7ed2f65becf8946ebff8c59b7e1eeeef44334 31 | IF (CUDA_VERSION GREATER 7.6) 32 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") 33 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") 34 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") 35 | ENDIF() 36 | 37 | if (NOT APPLE) 38 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") 39 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") 40 | ENDIF() 41 | 42 | IF (APPLE) 43 | EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 44 | STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 45 | MESSAGE(STATUS 
"DARWIN_VERSION=${DARWIN_VERSION}") 46 | 47 | #for el capitain have to use rpath 48 | 49 | IF (DARWIN_VERSION LESS 15) 50 | set(CMAKE_SKIP_RPATH TRUE) 51 | ENDIF () 52 | 53 | ELSE() 54 | #always skip for linux 55 | set(CMAKE_SKIP_RPATH TRUE) 56 | ENDIF() 57 | 58 | 59 | IF (CUDA_FOUND) 60 | 61 | MESSAGE(STATUS "Building shared library with GPU support") 62 | 63 | CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu) 64 | TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) 65 | 66 | add_executable(test_cpu tests/test_cpu.cpp ) 67 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 68 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -g --std=c++11") 69 | 70 | cuda_add_executable(test_gpu tests/test_gpu.cu) 71 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) 72 | 73 | INSTALL(TARGETS warpctc 74 | RUNTIME DESTINATION bin 75 | LIBRARY DESTINATION lib 76 | ARCHIVE DESTINATION lib/static) 77 | 78 | ELSE() 79 | MESSAGE(STATUS "Building shared library with no GPU support") 80 | 81 | if (NOT APPLE) 82 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") 83 | ENDIF() 84 | 85 | ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) 86 | 87 | add_executable(test_cpu tests/test_cpu.cpp ) 88 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 89 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") 90 | 91 | INSTALL(TARGETS warpctc 92 | RUNTIME DESTINATION bin 93 | LIBRARY DESTINATION lib 94 | ARCHIVE DESTINATION lib/static) 95 | ENDIF() 96 | 97 | 98 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2016 Baidu USA LLC. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 
10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. 
For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. 
Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. 
We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2015-2016, Baidu USA LLC. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. -------------------------------------------------------------------------------- /src/transforms/warp-ctc/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | Install warp-ctc first using cmake 4 | 5 | - `mkdir build` 6 | - `cd build` 7 | - `cmake ..` 8 | - `make` 9 | 10 | Now, go to the python directory and run `sudo python setup.py install` 11 | 12 | # Credits 13 | 14 | https://github.com/baidu-research/warp-ctc 15 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/doc/baidu-research-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/src/transforms/warp-ctc/doc/baidu-research-logo-small.png -------------------------------------------------------------------------------- /src/transforms/warp-ctc/doc/deep-speech-ctc-small.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/src/transforms/warp-ctc/doc/deep-speech-ctc-small.png -------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class DataLoader: 5 | def __init__(self, mbsz=128, min_len=20, max_len=30, num_classes=29): 6 | self.mbsz = mbsz 7 | self.min_len = min_len 8 | self.max_len = max_len 9 | self.num_classes = num_classes 10 | 11 | def sample(self): 12 | inputs = [] 13 | input_lens = [] 14 | outputs = [] 15 | output_lens = [] 16 | for i in xrange(self.mbsz): 17 | length = random.randint(self.min_len, self.max_len) 18 | input_lens.append(length) 19 | input = [random.randint(1, self.num_classes-1) for j in xrange(length)] 20 | #output = input[:] # identity output 21 | output = input[::4] # every 4th input is output 22 | """ 23 | # for acronym output 24 | output = [] 25 | flag = True 26 | for j in xrange(len(input)): 27 | if input[j] == 1: 28 | flag = True 29 | elif flag == True: 30 | flag = False 31 | output.append(input[j]) 32 | """ 33 | output_lens.append(len(output)) 34 | inputs.append(input) 35 | outputs.append(output) 36 | 37 | input_arr = np.zeros((self.mbsz, self.max_len, self.num_classes)) 38 | for i in xrange(self.mbsz): 39 | for j in xrange(len(inputs[i])): 40 | input_arr[i, j, inputs[i][j]] = 1.0 41 | label_arr = np.zeros((sum(output_lens)), dtype=np.int32) 42 | pos = 0 43 | for i in xrange(self.mbsz): 44 | label_arr[pos:pos+output_lens[i]] = outputs[i] 45 | pos += output_lens[i] 46 | 47 | return input_arr, np.array(input_lens, dtype=np.int32), label_arr, np.array(output_lens, dtype=np.int32) 48 | 49 | 50 | if __name__ == '__main__': 51 | dl = DataLoader() 52 | ret = dl.sample() 53 | print ret[0].shape 54 | 
-------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/rnnctc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | import ctc 7 | 8 | num_classes = 5 9 | mbsz = 1 10 | min_len = 12 11 | max_len = 12 12 | n_hidden = 100 13 | grad_clip = 100 14 | 15 | input_lens = T.ivector('input_lens') 16 | output = T.ivector('output') 17 | output_lens = T.ivector('output_lens') 18 | 19 | l_in = lasagne.layers.InputLayer(shape=(mbsz, max_len, num_classes)) 20 | 21 | h1f = lasagne.layers.RecurrentLayer(l_in, n_hidden, grad_clipping=grad_clip, 22 | nonlinearity=lasagne.nonlinearities.rectify) 23 | h1b = lasagne.layers.RecurrentLayer(l_in, n_hidden, grad_clipping=grad_clip, 24 | nonlinearity=lasagne.nonlinearities.rectify, backwards = True) 25 | h1 = lasagne.layers.ElemwiseSumLayer([h1f, h1b]) 26 | 27 | h2f = lasagne.layers.RecurrentLayer(h1, n_hidden, grad_clipping=grad_clip, 28 | nonlinearity=lasagne.nonlinearities.rectify) 29 | h2b = lasagne.layers.RecurrentLayer(h1, n_hidden, grad_clipping=grad_clip, 30 | nonlinearity=lasagne.nonlinearities.rectify, backwards = True) 31 | h2 = lasagne.layers.ElemwiseSumLayer([h2f, h2b]) 32 | 33 | h3 = lasagne.layers.RecurrentLayer(h2, num_classes, grad_clipping=grad_clip, 34 | nonlinearity=lasagne.nonlinearities.linear) 35 | # Turn into 36 | l_out = lasagne.layers.DimshuffleLayer(h3, (1, 0, 2)) 37 | 38 | network_output = lasagne.layers.get_output(l_out) 39 | 40 | cost = T.mean(ctc.cpu_ctc_th(network_output, input_lens, output, output_lens)) 41 | grads = T.grad(cost, wrt=network_output) 42 | all_params = lasagne.layers.get_all_params(l_out) 43 | updates = lasagne.updates.adam(cost, all_params, 0.001) 44 | 45 | train = theano.function([l_in.input_var, input_lens, output, output_lens], cost, updates=updates) 46 | predict = theano.function([l_in.input_var], 
network_output) 47 | get_grad = theano.function([l_in.input_var, input_lens, output, output_lens], grads) 48 | 49 | from loader import DataLoader 50 | data_loader = DataLoader(mbsz=mbsz, min_len=min_len, max_len=max_len, num_classes=num_classes) 51 | 52 | i = 1 53 | while True: 54 | i += 1 55 | print i 56 | sample = data_loader.sample() 57 | cost = train(*sample) 58 | out = predict(sample[0]) 59 | print cost 60 | print "input", sample[0][0].argmax(1) 61 | print "prediction", out[:, 0].argmax(1) 62 | print "expected", sample[2][:sample[3][0]] 63 | if i == 10000: 64 | grads = get_grad(*sample) 65 | import ipdb; ipdb.set_trace() 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/simple.py: -------------------------------------------------------------------------------- 1 | from ctc import cpu_ctc_th, cpu_ctc_np 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | acts = np.array([[[0.1, 0.6, 0.1, 0.1, 0.1]], 7 | [[0.1, 0.1, 0.6, 0.1, 0.1]]]) 8 | 9 | labels = np.array([1, 2]) 10 | label_lens = np.array([2]) 11 | act_lens = np.array([2]) 12 | cost, grads = cpu_ctc_np(acts, act_lens, labels, label_lens) 13 | print "expected cost:", 2.46285844 14 | 15 | print "cost (numpy):", cost.sum() 16 | print "grads (numpy):", grads 17 | 18 | def create_theano_func(): 19 | acts = T.ftensor3() 20 | act_lens = T.ivector() 21 | labels = T.ivector() 22 | label_lens = T.ivector() 23 | costs = cpu_ctc_th(acts, act_lens, labels, label_lens) 24 | cost = T.mean(costs) 25 | grads = T.grad(cost, acts) 26 | f = theano.function([acts, act_lens, labels, label_lens], cost, allow_input_downcast=True) 27 | g = theano.function([acts, act_lens, labels, label_lens], grads, allow_input_downcast=True) 28 | return f, g 29 | 30 | f, g = create_theano_func() 31 | print "cost (theano):", f(acts, act_lens, labels, label_lens).sum() 32 | print "grads (theano)", g(acts, act_lens, labels, label_lens) 33 | 34 | 
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/LICENSE: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceLoadBalancingSearch 44 | // Upper Bound search from A (needles) into B (haystack). The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 47 | 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 
67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 
119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctascan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuenums.h" 38 | #include "deviceutil.cuh" 39 | #include "intrinsics.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // CTAReduce 45 | 46 | template > 47 | struct CTAReduce { 48 | typedef typename Op::first_argument_type T; 49 | enum { Size = NT, Capacity = NT }; 50 | struct Storage { T shared[Capacity]; }; 51 | 52 | MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) { 53 | storage.shared[tid] = x; 54 | __syncthreads(); 55 | 56 | // Fold the data in half with each pass. 57 | #pragma unroll 58 | for(int destCount = NT / 2; destCount >= 1; destCount /= 2) { 59 | if(tid < destCount) { 60 | // Read from the right half and store to the left half. 
// (Continuation of the generic CTAReduce<NT, Op>::Reduce body begun above.)
            x = op(x, storage.shared[destCount + tid]);
            storage.shared[tid] = x;
        }
        __syncthreads();
    }
    T total = storage.shared[0];
    __syncthreads();
    return total;
}
};

#if __CUDA_ARCH__ >= 300

// Kepler (SM 3.0+) specialization of CTAReduce for integer addition.
// NOTE(review): the template parameter lists in this file were stripped by
// the text extraction; they are restored here to match upstream moderngpu.
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    typedef int T;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    // Returns the CTA-wide sum of x. Assumes NT is a multiple of WARP_SIZE.
    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        // In the first phase, threads cooperatively find the reduction within
        // their segment. The segments are SecSize threads (NT / WARP_SIZE)
        // wide.
        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_add(x, offset, SecSize);

        // The last thread in each segment stores the local reduction to shared
        // memory.
        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        // Reduce the totals of each input segment. The spine is WARP_SIZE
        // threads wide.
        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_add(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

// Kepler specialization of CTAReduce for integer max. Same two-phase
// shuffle structure as the plus<int> specialization, using shfl_max.
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
    typedef mgpu::maximum<int> Op;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_max(x, offset, SecSize);

        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_max(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTAScan

template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
    typedef typename Op::result_type T;
    enum { Size = NT, Capacity = 2 * NT + 1 };
    struct Storage { T shared[Capacity]; };

    // CTA-wide scan of one value per thread. *total receives the reduction of
    // all NT inputs. type selects inclusive or exclusive results.
    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
        MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {

        storage.shared[tid] = x;
        int first = 0;
        __syncthreads();

        // Double-buffered Hillis-Steele scan; 'first' toggles between the two
        // halves of shared memory on each pass.
        #pragma unroll
        for(int offset = 1; offset < NT; offset += offset) {
            if(tid >= offset)
                x = op(storage.shared[first + tid - offset], x);
            first = NT - first;
            storage.shared[first + tid] = x;
            __syncthreads();
        }
        *total = storage.shared[first + NT - 1];

        if(MgpuScanTypeExc == type)
            x = tid ? storage.shared[first + tid - 1] : identity;

        __syncthreads();
        return x;
    }
    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
        T total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
    }
};

////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan on Kepler.
// This uses the shfl intrinsic to reduce scan latency.

#if __CUDA_ARCH__ >= 300

template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
    enum { Capacity = NumSegments + 1 };
    struct Storage { int shared[Capacity + 1]; };

    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
        MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {

        // Define WARP_SIZE segments that are NT / WARP_SIZE large.
        // Each warp makes log(SegSize) shfl_add calls.
        // The spine makes log(WARP_SIZE) shfl_add calls.
        int lane = (SegSize - 1) & tid;
        int segment = tid / SegSize;

        // Scan each segment using shfl_add.
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < SegSize; offset *= 2)
            scan = shfl_add(scan, offset, SegSize);

        // Store the reduction (last element) of each segment into storage.
        if(SegSize - 1 == lane) storage.shared[segment] = scan;
        __syncthreads();

        // Warp 0 does a full shfl warp scan on the partials. The total is
        // stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
        if(tid < NumSegments) {
            int y = storage.shared[tid];
            int scan = y;
            #pragma unroll
            for(int offset = 1; offset < NumSegments; offset *= 2)
                scan = shfl_add(scan, offset, NumSegments);
            storage.shared[tid] = scan - y;
            if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
        }
        __syncthreads();

        // Add the scanned partials back in and convert to exclusive scan.
        scan += storage.shared[segment];
        if(MgpuScanTypeExc == type) {
            scan -= x;
            if(identity && !tid) scan = identity;
        }
        *total = storage.shared[NumSegments];
        __syncthreads();

        return scan;
    }
    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
        int total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
// Exclusive scan of one bool per thread; *total receives the CTA-wide count
// of set flags. 'shared' must hold at least NumWarps + 1 ints.

template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
    const int NumWarps = NT / WARP_SIZE;
    int warp = tid / WARP_SIZE;
    // BUGFIX: lane is the thread's position within its warp. The dumped text
    // read 'int lane = (WARP_SIZE - 1);', which would make bfe() below
    // extract 31 bits for every thread. Restored '& tid' per upstream
    // moderngpu.
    int lane = (WARP_SIZE - 1) & tid;

    // Store the bit totals for each warp.
    uint bits = __ballot(x);
    shared[warp] = popc(bits);
    __syncthreads();

#if __CUDA_ARCH__ >= 300
    if(tid < NumWarps) {
        int x = shared[tid];
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < NumWarps; offset *= 2)
            scan = shfl_add(scan, offset, NumWarps);
        shared[tid] = scan - x;
        // BUGFIX(review): store the CTA total so the read of shared[NumWarps]
        // below is defined on this path too (matches the SM 2.x branch and
        // upstream moderngpu).
        if(NumWarps - 1 == tid) shared[NumWarps] = scan;
    }
    __syncthreads();

#else
    // Thread 0 scans warp totals.
    if(!tid) {
        int scan = 0;
        #pragma unroll
        for(int i = 0; i < NumWarps; ++i) {
            int y = shared[i];
            shared[i] = scan;
            scan += y;
        }
        shared[NumWarps] = scan;
    }
    __syncthreads();

#endif // __CUDA_ARCH__ >= 300

    // Add the warp scan back into the partials.
    int scan = shared[warp] + __popc(bfe(bits, 0, lane));
    *total = shared[NumWarps];
    __syncthreads();
    return scan;
}

} // namespace mgpu

--------------------------------------------------------------------------------
/src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasearch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "deviceutil.cuh"
#include "../mgpudevice.cuh"

namespace mgpu {

// One step of a biased binary search: splits [begin, end) at a point biased
// towards 'end' by 'shift' and narrows the interval around 'key'.
// NOTE(review): the template parameter lists in this file were stripped by
// the text extraction; they are restored here to match upstream moderngpu.
template<MgpuBounds Bounds, typename IntT, typename It, typename T,
    typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
    int shift, Comp comp) {

    IntT scale = (1<< shift) - 1;
    int mid = (int)((begin + scale * end)>> shift);

    T key2 = data[mid];
    bool pred = (MgpuBoundsUpper == Bounds) ?
        !comp(key, key2) :
        comp(key2, key);
    if(pred) begin = mid + 1;
    else end = mid;
}

// Binary search biased towards the end of the array; 'levels' controls how
// many heavily-biased probes run before falling back to unbiased bisection.
// The first probes use IntT = int64 to avoid overflow in begin + scale * end.
template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
    Comp comp) {

    int begin = 0;
    int end = count;

    if(levels >= 4 && begin < end)
        BinarySearchIt<Bounds, int64>(data, begin, end, key, 9, comp);
    if(levels >= 3 && begin < end)
        BinarySearchIt<Bounds, int64>(data, begin, end, key, 7, comp);
    if(levels >= 2 && begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 5, comp);
    if(levels >= 1 && begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 4, comp);

    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

// Standard (unbiased) binary search returning the lower- or upper-bound
// insertion point of key, selected by Bounds.
template<MgpuBounds Bounds, typename It, typename T, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
    int begin = 0;
    int end = count;
    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// MergePath search

// Finds the A-side intersection of cross-diagonal 'diag' with the merge path
// of sorted sequences a and b. Returns the number of A elements consumed.
template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
    Comp comp) {

    typedef typename std::iterator_traits<It1>::value_type T;
    int begin = max(0, diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end)>> 1;
        T aKey = a[mid];
        T bKey = b[diag - 1 - mid];
        bool pred = (MgpuBoundsUpper == Bounds) ?
            comp(aKey, bKey) :
            !comp(bKey, aKey);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}


////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search

template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
    int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {

    // leftEnd and rightStart are defined from the origin, and diag is defined
    // from aOffset.
    // We only need to run a Merge Path search if the diagonal intersects the
    // segment that strides the left and right halves (i.e. is between leftEnd
    // and rightStart).
    if(aOffset + diag <= leftEnd) return diag;
    if(aOffset + diag >= rightStart) return aCount;

    bCount = min(bCount, rightStart - bOffset);
    int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end)>> 1;
        int ai = aOffset + mid;
        int bi = bOffset + diag - 1 - mid;

        bool pred = !comp(keys[bi], keys[ai]);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// BalancedPath search

// Like MergePath but balances duplicate runs between A and B. Returns the
// A index in .x and a "starred partition" flag in .y.
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
    typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
    int bCount, int diag, int levels, Comp comp) {

    typedef typename std::iterator_traits<InputIt1>::value_type T;

    int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
    int aIndex = p;
    int bIndex = diag - p;

    bool star = false;
    if(bIndex < bCount) {
        if(Duplicates) {
            T x = b[bIndex];

            // Search for the beginning of the duplicate run in both A and B.
            int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
                levels, comp);
            int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
                levels, comp);

            // The distance between the merge path and the lower_bound is the
            // 'run'. We add up the a- and b- runs and evenly distribute them to
            // get a stairstep path.
            int aRun = aIndex - aStart;
            int bRun = bIndex - bStart;
            int xCount = aRun + bRun;

            // Attempt to advance b and regress a.
            int bAdvance = max(xCount>> 1, bRun);
            int bEnd = min(bCount, bStart + bAdvance + 1);
            int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
                bEnd - bIndex, x, comp) + bIndex;
            bRun = bRunEnd - bStart;

            bAdvance = min(bAdvance, bRun);
            int aAdvance = xCount - bAdvance;

            bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
            aIndex = aStart + aAdvance;

            if(roundUp) star = true;
        } else {
            if(aIndex && aCount) {
                T aKey = a[aIndex - 1];
                T bKey = b[bIndex];

                // If the last consumed element in A (aIndex - 1) is the same as
                // the next element in B (bIndex), we're sitting at a starred
                // partition.
                if(!comp(aKey, bKey)) star = true;
            }
        }
    }
    return make_int2(aIndex, star);
}

} // namespace mgpu

--------------------------------------------------------------------------------
/src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasegreduce.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasegscan.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Segmented reduce utility functions. 44 | 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include 46 | // the first addressed row/segment. 47 | 48 | struct SegReduceRange { 49 | int begin; 50 | int end; 51 | int total; 52 | bool flushLast; 53 | }; 54 | 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) { 56 | SegReduceRange range; 57 | range.begin = 0x7fffffff & limit0; 58 | range.end = 0x7fffffff & limit1; 59 | range.total = range.end - range.begin; 60 | range.flushLast = 0 == (0x80000000 & limit1); 61 | range.end += !range.flushLast; 62 | return range; 63 | } 64 | 65 | // Reconstitute row/segment indices from a starting row index and packed end 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv. 67 | template 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags, 69 | int rows[VT + 1]) { 70 | 71 | rows[0] = first; 72 | #pragma unroll 73 | for(int i = 0; i < VT; ++i) { 74 | if((1<< i) & endFlags) ++first; 75 | rows[i + 1] = first; 76 | } 77 | } 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // After loading CSR terms into shared memory, each thread binary searches 81 | // (upper-bound) to find its starting point. Each thread then walks forward, 82 | // emitting the csr0-relative row indices to register. 83 | 84 | template 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared, 86 | int numRows, int end, int rows[VT + 1], int rowStarts[VT]) { 87 | 88 | // Each thread binary searches for its starting row. 
89 | int row = BinarySearch(csr_shared, numRows, tidOffset, 90 | mgpu::less()) - 1; 91 | 92 | // Each thread starts at row and scans forward, emitting row IDs into 93 | // register. Store the CTA-local row index (starts at 0) to rows and the 94 | // start of the row (globally) to rowStarts. 95 | int curOffset = csr_shared[row]; 96 | int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 97 | 98 | rows[0] = row; 99 | rowStarts[0] = curOffset; 100 | int endFlags = 0; 101 | 102 | #pragma unroll 103 | for(int i = 1; i <= VT; ++i) { 104 | // Advance the row cursor when the iterator hits the next row offset. 105 | if(tidOffset + i == nextOffset) { 106 | // Set an end flag when the cursor advances to the next row. 107 | endFlags |= 1<< (i - 1); 108 | 109 | // Advance the cursor and load the next row offset. 110 | ++row; 111 | curOffset = nextOffset; 112 | nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 113 | } 114 | rows[i] = row; 115 | if(i < VT) rowStarts[i] = curOffset; 116 | } 117 | __syncthreads(); 118 | 119 | return endFlags; 120 | } 121 | 122 | //////////////////////////////////////////////////////////////////////////////// 123 | // DeviceSegReducePrepare 124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags 125 | // by comparing adjacent row IDs. 126 | 127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by 128 | // the kernel that actually evaluates the segmented reduction if no preprocesing 129 | // is desired. 130 | struct SegReduceTerms { 131 | int endFlags; 132 | int tidDelta; 133 | }; 134 | 135 | template 136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows, 137 | int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) { 138 | 139 | // Pass a sentinel (end) to point to the next segment start. If we flush, 140 | // this is the end of this tile. 
Otherwise it is INT_MAX 141 | int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared, 142 | numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts); 143 | 144 | // Find the distance to to scan to compute carry-in for each thread. Use the 145 | // existance of an end flag anywhere in the thread to determine if carry-out 146 | // values from the left should propagate through to the right. 147 | int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT], 148 | csr_shared); 149 | 150 | SegReduceTerms terms = { endFlags, tidDelta }; 151 | return terms; 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | // CTASegReduce 156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA 157 | // segmented reduction. Stores partials to global memory. 158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order. 159 | template 160 | struct CTASegReduce { 161 | typedef CTASegScan SegScan; 162 | 163 | enum { 164 | NV = NT * VT, 165 | Capacity = HalfCapacity ? (NV / 2) : NV 166 | }; 167 | 168 | union Storage { 169 | typename SegScan::Storage segScanStorage; 170 | T values[Capacity]; 171 | }; 172 | 173 | template 174 | MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total, 175 | int tidDelta, int startRow, int block, int tid, T data[VT], 176 | DestIt dest_global, T* carryOut_global, T identity, Op op, 177 | Storage& storage) { 178 | 179 | // Run a segmented scan within the thread. 180 | T x, localScan[VT]; 181 | #pragma unroll 182 | for(int i = 0; i < VT; ++i) { 183 | x = i ? op(x, data[i]) : data[i]; 184 | localScan[i] = x; 185 | if(rows[i] != rows[i + 1]) x = identity; 186 | } 187 | 188 | // Run a parallel segmented scan over the carry-out values to compute 189 | // carry-in. 
190 | T carryOut; 191 | T carryIn = SegScan::SegScanDelta(tid, tidDelta, x, 192 | storage.segScanStorage, &carryOut, identity, op); 193 | 194 | // Store the carry-out for the entire CTA to global memory. 195 | if(!tid) carryOut_global[block] = carryOut; 196 | 197 | dest_global += startRow; 198 | if(HalfCapacity && total > Capacity) { 199 | // Add carry-in to each thread-local scan value. Store directly 200 | // to global. 201 | #pragma unroll 202 | for(int i = 0; i < VT; ++i) { 203 | // Add the carry-in to the local scan. 204 | T x2 = op(carryIn, localScan[i]); 205 | 206 | // Store on the end flag and clear the carry-in. 207 | if(rows[i] != rows[i + 1]) { 208 | carryIn = identity; 209 | dest_global[rows[i]] = x2; 210 | } 211 | } 212 | } else { 213 | // All partials fit in shared memory. Add carry-in to each thread- 214 | // local scan value. 215 | #pragma unroll 216 | for(int i = 0; i < VT; ++i) { 217 | // Add the carry-in to the local scan. 218 | T x2 = op(carryIn, localScan[i]); 219 | 220 | // Store reduction when the segment changes and clear the 221 | // carry-in. 222 | if(rows[i] != rows[i + 1]) { 223 | storage.values[rows[i]] = x2; 224 | carryIn = identity; 225 | } 226 | } 227 | __syncthreads(); 228 | 229 | // Cooperatively store reductions to global memory. 230 | for(int index = tid; index < total; index += NT) 231 | dest_global[index] = storage.values[index]; 232 | __syncthreads(); 233 | } 234 | } 235 | }; 236 | 237 | } // namespace mgpu 238 | 239 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 44 | 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 
92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasortedsearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpudevice.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // DeviceSerialSearch 45 | 46 | template 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin, 49 | int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices, 50 | Comp comp) { 51 | 52 | const int FlagA = IndexA ? 0x80000000 : 1; 53 | const int FlagB = IndexB ? 0x80000000 : 1; 54 | 55 | T aKey = keys_shared[aBegin]; 56 | T bKey = keys_shared[bBegin]; 57 | T aPrev, bPrev; 58 | if(aBegin > 0) aPrev = keys_shared[aBegin - 1]; 59 | if(bBegin > 0) bPrev = keys_shared[bBegin - 1]; 60 | int decisions = 0; 61 | int matchCountA = 0; 62 | int matchCountB = 0; 63 | 64 | #pragma unroll 65 | for(int i = 0; i < VT; ++i) { 66 | bool p; 67 | if(RangeCheck && aBegin >= aEnd) p = false; 68 | else if(RangeCheck && bBegin >= bEnd) p = true; 69 | else p = (MgpuBoundsUpper == Bounds) ? 70 | comp(aKey, bKey) : 71 | !comp(bKey, aKey); 72 | 73 | if(p) { 74 | // aKey is smaller than bKey, so it is inserted before bKey. 75 | // Save bKey's index (bBegin + first) as the result of the search 76 | // and advance to the next needle in A. 77 | bool match = false; 78 | if(MatchA) { 79 | // Test if there is an element in B that matches aKey. 80 | if(MgpuBoundsUpper == Bounds) { 81 | // Upper Bound: We're inserting aKey after bKey. If there 82 | // is a match for aKey it must be bPrev. Check that bPrev 83 | // is in range and equal to aKey. 84 | // The predicate test result !comp(aKey, bPrev) was 85 | // established on the previous A-advancing iteration (it 86 | // failed the comp(aKey, bKey) test to get us to this 87 | // point). Check the other half of the equality condition 88 | // with a second comparison. 
89 | bool inRange = !RangeCheck || (bBegin > aEnd); 90 | match = inRange && !comp(bPrev, aKey); 91 | } else { 92 | // Lower Bound: We're inserting aKey before bKey. If there 93 | // is a match for aKey, it must be bKey. Check that bKey 94 | // is in range and equal to aKey. 95 | // The predicate test !comp(bKey, aKey) has established one 96 | // half of the equality condition. We establish the other 97 | // half with a second comparison. 98 | bool inRange = !RangeCheck || (bBegin < bEnd); 99 | match = inRange && !comp(aKey, bKey); 100 | } 101 | } 102 | 103 | int index = 0; 104 | if(IndexA) index = bOffset + bBegin; 105 | if(match) index |= FlagA; 106 | if(IndexA || MatchA) indices[i] = index; 107 | matchCountA += match; 108 | 109 | // Mark the decision bit to indicate that this iteration has 110 | // progressed A (the needles). 111 | decisions |= 1<< i; 112 | aPrev = aKey; 113 | aKey = keys_shared[++aBegin]; 114 | } else { 115 | // aKey is larger than bKey, so it is inserted after bKey (but we 116 | // don't know where yet). Advance the B index to the next element in 117 | // the haystack to continue the search for the current needle. 118 | bool match = false; 119 | if(MatchB) { 120 | if(MgpuBoundsUpper == Bounds) { 121 | // Upper Bound: aKey is not smaller than bKey. We advance to 122 | // the next haystack element in B. If there is a match in A 123 | // for bKey it must be aKey. By entering this branch we've 124 | // verified that !comp(aKey, bKey). Making the reciprocal 125 | // comparison !comp(bKey, aKey) establishes aKey == bKey. 126 | bool inRange = !RangeCheck || 127 | ((bBegin < bEnd) && (aBegin < aEnd)); 128 | match = inRange && !comp(bKey, aKey); 129 | } else { 130 | // Lower Bound: bKey is smaller than aKey. We advance to the 131 | // next element in B. If there is a match for bKey, it must 132 | // be aPrev. The previous A-advancing iteration proved that 133 | // !comp(bKey, aPrev). 
We test !comp(aPrev, bKey) for the 134 | // other half of the equality condition. 135 | bool inRange = !RangeCheck || 136 | ((bBegin < bEnd) && (aBegin > 0)); 137 | match = inRange && !comp(aPrev, bKey); 138 | } 139 | } 140 | 141 | int index = 0; 142 | if(IndexB) index = aOffset + aBegin; 143 | if(match) index |= FlagB; 144 | if(IndexB || MatchB) indices[i] = index; 145 | matchCountB += match; 146 | 147 | // Keep the decision bit cleared to indicate that this iteration 148 | // has progressed B (the haystack). 149 | bPrev = bKey; 150 | bKey = keys_shared[++bBegin]; 151 | } 152 | } 153 | return make_int3(decisions, matchCountA, matchCountB); 154 | } 155 | 156 | //////////////////////////////////////////////////////////////////////////////// 157 | // CTASortedSearch 158 | // Take keys in shared memory and return indices and b-match flags in shared 159 | // memory. 160 | // NOTE: This function doesn't do any strided-to-thread order transposes so 161 | // using an even number of values per thread will incur no additional bank 162 | // conflicts. 163 | 164 | template 166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount, 167 | int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended, 168 | int tid, int* indices_shared, Comp comp) { 169 | 170 | // Run a merge path to find the start of the serial search for each thread. 171 | int diag = VT * tid; 172 | int mp = MergePath(keys_shared + aStart, aCount, 173 | keys_shared + bStart, bCount, diag, comp); 174 | int a0tid = mp; 175 | int b0tid = diag - mp; 176 | 177 | // Serial search into register. 
178 | int3 results; 179 | int indices[VT]; 180 | if(extended) 181 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 183 | a0 - aStart, b0 - bStart, indices, comp); 184 | else 185 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 187 | a0 - aStart, b0 - bStart, indices, comp); 188 | __syncthreads(); 189 | 190 | // Compact the indices into shared memory. Use the decision bits (set is A, 191 | // cleared is B) to select the destination. 192 | int decisions = results.x; 193 | b0tid += aCount; 194 | #pragma unroll 195 | for(int i = 0; i < VT; ++i) { 196 | if((1<< i) & decisions) { 197 | if(IndexA || MatchA) indices_shared[a0tid++] = indices[i]; 198 | } else { 199 | if(IndexB || MatchB) indices_shared[b0tid++] = indices[i]; 200 | } 201 | } 202 | __syncthreads(); 203 | 204 | // Return the match counts for A and B keys. 205 | return make_int2(results.y, results.z); 206 | } 207 | 208 | } // namespace mgpu 209 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes. 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes. 
47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 59 | 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index. 85 | 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | headFlags &= (1<< numBits) - 1; 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. 
Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/serialsets.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // SerialSetIntersection 43 | // Emit A if A and B are in range and equal. 
44 | 45 | template 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd, 47 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 48 | 49 | const int MinIterations = VT / 2; 50 | int commit = 0; 51 | 52 | #pragma unroll 53 | for(int i = 0; i < VT; ++i) { 54 | bool test = RangeCheck ? 55 | ((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) : 56 | (i < MinIterations || (aBegin + bBegin < end)); 57 | 58 | if(test) { 59 | T aKey = data[aBegin]; 60 | T bKey = data[bBegin]; 61 | 62 | bool pA = comp(aKey, bKey); 63 | bool pB = comp(bKey, aKey); 64 | 65 | // The outputs must come from A by definition of set interection. 66 | results[i] = aKey; 67 | indices[i] = aBegin; 68 | 69 | if(!pB) ++aBegin; 70 | if(!pA) ++bBegin; 71 | if(pA == pB) commit |= 1<< i; 72 | } 73 | } 74 | return commit; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // SerialSetUnion 79 | // Emit A if A <= B. Emit B if B < A. 80 | 81 | template 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd, 83 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 84 | 85 | const int MinIterations = VT / 2; 86 | int commit = 0; 87 | 88 | #pragma unroll 89 | for(int i = 0; i < VT; ++i) { 90 | bool test = RangeCheck ? 91 | (aBegin + bBegin < end) : 92 | (i < MinIterations || (aBegin + bBegin < end)); 93 | 94 | if(test) { 95 | T aKey = data[aBegin]; 96 | T bKey = data[bBegin]; 97 | 98 | bool pA = false, pB = false; 99 | if(RangeCheck && aBegin >= aEnd) 100 | pB = true; 101 | else if(RangeCheck && bBegin >= bEnd) 102 | pA = true; 103 | else { 104 | // Both are in range. 105 | pA = comp(aKey, bKey); 106 | pB = comp(bKey, aKey); 107 | } 108 | 109 | // Output A in case of a tie, so check if b < a. 110 | results[i] = pB ? bKey : aKey; 111 | indices[i] = pB ? 
bBegin : aBegin; 112 | if(!pB) ++aBegin; 113 | if(!pA) ++bBegin; 114 | commit |= 1<< i; 115 | } 116 | } 117 | return commit; 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | // SerialSetDifference 122 | // Emit A if A < B. 123 | 124 | template 125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd, 126 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 127 | 128 | const int MinIterations = VT / 2; 129 | int commit = 0; 130 | 131 | #pragma unroll 132 | for(int i = 0; i < VT; ++i) { 133 | bool test = RangeCheck ? 134 | (aBegin + bBegin < end) : 135 | (i < MinIterations || (aBegin + bBegin < end)); 136 | if(test) { 137 | T aKey = data[aBegin]; 138 | T bKey = data[bBegin]; 139 | 140 | bool pA = false, pB = false; 141 | if(RangeCheck && aBegin >= aEnd) 142 | pB = true; 143 | else if(RangeCheck && bBegin >= bEnd) 144 | pA = true; 145 | else { 146 | pA = comp(aKey, bKey); 147 | pB = comp(bKey, aKey); 148 | } 149 | 150 | // The outputs must come from A by definition of set difference. 151 | results[i] = aKey; 152 | indices[i] = aBegin; 153 | if(!pB) ++aBegin; 154 | if(!pA) ++bBegin; 155 | if(pA) commit |= 1<< i; 156 | } 157 | } 158 | return commit; 159 | } 160 | 161 | //////////////////////////////////////////////////////////////////////////////// 162 | // SerialSetSymDiff 163 | // Emit A if A < B and emit B if B < A. 164 | 165 | template 166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd, 167 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 168 | 169 | const int MinIterations = VT / 2; 170 | int commit = 0; 171 | 172 | #pragma unroll 173 | for(int i = 0; i < VT; ++i) { 174 | bool test = RangeCheck ? 
175 | (aBegin + bBegin < end) : 176 | (i < MinIterations || (aBegin + bBegin < end)); 177 | if(test) { 178 | T aKey = data[aBegin]; 179 | T bKey = data[bBegin]; 180 | 181 | bool pA = false, pB = false; 182 | if(RangeCheck && (bBegin >= bEnd)) 183 | pA = true; 184 | else if(RangeCheck && (aBegin >= aEnd)) 185 | pB = true; 186 | else { 187 | pA = comp(aKey, bKey); 188 | pB = comp(bKey, aKey); 189 | } 190 | 191 | results[i] = pA ? aKey : bKey; 192 | indices[i] = pA ? aBegin : bBegin; 193 | if(!pA) ++bBegin; 194 | if(!pB) ++aBegin; 195 | if(pA != pB) commit |= 1<< i; 196 | } 197 | } 198 | return commit; 199 | } 200 | 201 | //////////////////////////////////////////////////////////////////////////////// 202 | // SerialSetOp 203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops 204 | // above. 205 | 206 | template 207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd, 208 | int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) { 209 | 210 | int end = aBegin + bBegin + VT - star; 211 | if(RangeCheck) end = min(end, aEnd + bEnd); 212 | int commit; 213 | switch(Op) { 214 | case MgpuSetOpIntersection: 215 | commit = SerialSetIntersection(data, aBegin, 216 | aEnd, bBegin, bEnd, end, results, indices, comp); 217 | break; 218 | case MgpuSetOpUnion: 219 | commit = SerialSetUnion(data, aBegin, aEnd, 220 | bBegin, bEnd, end, results, indices, comp); 221 | break; 222 | case MgpuSetOpDiff: 223 | commit = SerialSetDifference(data, aBegin, aEnd, 224 | bBegin, bEnd, end, results, indices, comp); 225 | break; 226 | case MgpuSetOpSymDiff: 227 | commit = SerialSetSymDiff(data, aBegin, aEnd, 228 | bBegin, bEnd, end, results, indices, comp); 229 | break; 230 | } 231 | __syncthreads(); 232 | return commit; 233 | } 234 | 235 | } // namespace mgpu 236 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/sortnetwork.cuh: 
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Odd-even transposition sorting network. Sorts keys and values in-place in 43 | // register. 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 45 | 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using 47 | // template loop unrolling. 48 | /* 49 | template 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) { 51 | #pragma unroll 52 | for(int level = 0; level < VT; ++level) { 53 | 54 | #pragma unroll 55 | for(int i = 1 & level; i < VT - 1; i += 2) { 56 | if(comp(keys[i + 1], keys[i])) { 57 | mgpu::swap(keys[i], keys[i + 1]); 58 | mgpu::swap(values[i], values[i + 1]); 59 | } 60 | } 61 | } 62 | }*/ 63 | 64 | template 65 | struct OddEvenTransposeSortT { 66 | // Sort segments marked by head flags. If the head flag between i and i + 1 67 | // is set (so that (2<< i) & flags is true), the values belong to different 68 | // segments and are not swapped. 
69 | template 70 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { 71 | #pragma unroll 72 | for(int i = 1 & I; i < VT - 1; i += 2) 73 | if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) { 74 | mgpu::swap(keys[i], keys[i + 1]); 75 | mgpu::swap(values[i], values[i + 1]); 76 | } 77 | OddEvenTransposeSortT::Sort(keys, values, flags, comp); 78 | } 79 | }; 80 | template struct OddEvenTransposeSortT { 81 | template 82 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { } 83 | }; 84 | 85 | template 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) { 87 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp); 88 | } 89 | template 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags, 91 | Comp comp) { 92 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp); 93 | } 94 | 95 | //////////////////////////////////////////////////////////////////////////////// 96 | // Batcher Odd-Even Mergesort network 97 | // Unstable but executes much faster than the transposition sort. 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 99 | 100 | template 101 | struct OddEvenMergesortT { 102 | template 103 | MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags, 104 | int a, int b, Comp comp) { 105 | if(b < Count) { 106 | // Mask the bits between a and b. Any head flags in this interval 107 | // means the keys are in different segments and must not be swapped. 
108 | const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1); 109 | if(!(Mask & flags) && comp(keys[b], keys[a])) { 110 | mgpu::swap(keys[b], keys[a]); 111 | mgpu::swap(values[b], values[a]); 112 | } 113 | } 114 | } 115 | 116 | template 117 | struct OddEvenMerge { 118 | template 119 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 120 | Comp comp) { 121 | // Compare and swap 122 | const int M = 2 * R; 123 | OddEvenMerge::Merge(keys, values, flags, comp); 124 | OddEvenMerge::Merge(keys, values, flags, comp); 125 | 126 | #pragma unroll 127 | for(int i = Low2 + R; i + R < Low2 + Width; i += M) 128 | CompareAndSwap(keys, values, flags, i, i + R, comp); 129 | } 130 | }; 131 | template 132 | struct OddEvenMerge { 133 | template 134 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 135 | Comp comp) { 136 | CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp); 137 | } 138 | }; 139 | 140 | template 141 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 142 | Comp comp) { 143 | 144 | const int M = Width / 2; 145 | OddEvenMergesortT::Sort(keys, values, flags, comp); 146 | OddEvenMergesortT::Sort(keys, values, flags, comp); 147 | OddEvenMerge<1, Low>::Merge(keys, values, flags, comp); 148 | } 149 | }; 150 | template struct OddEvenMergesortT<1, Low, Count> { 151 | template 152 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 153 | Comp comp) { } 154 | }; 155 | 156 | template 157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) { 158 | const int Width = 1<< sLogPow2::value; 159 | OddEvenMergesortT::Sort(keys, values, 0, comp); 160 | } 161 | template 162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags, 163 | Comp comp) { 164 | const int Width = 1<< sLogPow2::value; 165 | OddEvenMergesortT::Sort(keys, values, flags, comp); 166 | } 167 | 168 | } // namespace mgpu 169 | -------------------------------------------------------------------------------- 
/src/transforms/warp-ctc/include/contrib/moderngpu/include/mgpudevice.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "mgpuenums.h" 38 | #include "device/deviceutil.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // device/loadstore.cuh 44 | 45 | // For 0 <= i < VT: 46 | // index = NT * i + tid; 47 | // reg[i] = data[index]; 48 | // Synchronize after load. 49 | template 50 | MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg, 51 | bool sync = true); 52 | 53 | // For 0 <= i < VT: 54 | // index = NT * i + tid; 55 | // if(index < count) reg[i] = data[index]; 56 | // No synchronize after load. 57 | template 58 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 59 | T* reg, bool sync = false); 60 | 61 | template 62 | MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid, 63 | T* reg, T init, bool sync = false); 64 | 65 | // For 0 <= i < VT: 66 | // index = NT * i + tid; 67 | // if(index < count) reg[i] = data[index]; 68 | // No synchronize after load. 69 | template 70 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 71 | T* reg, bool sync = false); 72 | 73 | // For 0 <= i < VT: 74 | // index = NT * i + tid; 75 | // if(index < count) reg[i] = data[index]; 76 | // No synchronize after load. 77 | template 78 | MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid, 79 | T* reg, T init, bool sync = false); 80 | 81 | // For 0 <= i < VT: 82 | // index = NT * i + tid; 83 | // if(index < count) reg[i] = data[index]; 84 | // No synchronize after load. 
85 | // No optimized code path for count < NV (smaller generated code). 86 | template 87 | MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid, 88 | T* reg, bool sync = false); 89 | 90 | 91 | // For 0 <= i < VT: 92 | // index = VT * tid + i. 93 | // if(index < count) reg[i] = data[index]; 94 | // No synchronize after load. 95 | template 96 | MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid, 97 | T* reg); 98 | 99 | template 100 | MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid, 101 | T* reg, T init); 102 | 103 | // For 0 <= i < VT: 104 | // index = NT * i + tid; 105 | // if(index < count) data[index] = reg[i]; 106 | // Synchronize after load. 107 | template 108 | MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest, 109 | bool sync = true); 110 | 111 | // For 0 <= i < VT: 112 | // index = NT * i + tid; 113 | // if(index < count) data[index] = reg[i]; 114 | // No synchronize after load. 115 | template 116 | MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid, 117 | OutputIt dest, bool sync = false); 118 | 119 | // For 0 <= index < count: 120 | // dest[index] = source[index]; 121 | // This function is intended to replace DeviceGlobalToShared in cases where 122 | // count is much less than NT * VT. 123 | template 124 | MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid, 125 | OutputIt dest, bool sync = true); 126 | 127 | // For 0 <= index < count: 128 | // dest[index] = source[index]; 129 | // Synchronize after store. 130 | template 131 | MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid, 132 | OutputIt dest, bool sync = true); 133 | 134 | // For 0 <= index < count: 135 | // dest[index] = source[index]; 136 | // Synchronize after store. 
137 | template 138 | MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid, 139 | T* dest, bool sync = true); 140 | 141 | template 142 | MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid, 143 | T* dest, bool sync = true); 144 | 145 | // For 0 <= index < count: 146 | // dest[index] = source[index]; 147 | // Synchronize after store. 148 | // No optimized code path for count < NV (smaller generated code). 149 | template 150 | MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid, 151 | T* dest, bool sync = true); 152 | 153 | template 154 | MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid, 155 | T* dest, T init, bool sync = true); 156 | 157 | template 158 | MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source, 159 | int tid, T* dest, T init, bool sync = true); 160 | 161 | // For 0 <= index < count: 162 | // dest[index] = source[index]; 163 | // No synchronize. 164 | template 165 | MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid, 166 | OutputIt dest, bool sync = false); 167 | 168 | // Transponse VT elements in NT threads (x) into thread-order registers (y) 169 | // using only NT * VT / 2 elements of shared memory. 170 | template 171 | MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y); 172 | 173 | // For 0 <= i < VT: 174 | // index = NT * i + tid; 175 | // if(index < count) 176 | // gather = indices[index]; 177 | // reg[i] = data[gather]; 178 | // Synchronize after load. 
179 | template 180 | MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT], 181 | int tid, T* reg, bool sync = true); 182 | 183 | template 184 | MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT], 185 | int tid, T* reg, T identity, bool sync = true); 186 | 187 | // For 0 <= i < VT: 188 | // index = NT * i + tid; 189 | // if(index < count) 190 | // scatter = indices[index]; 191 | // data[scatter] = reg[i]; 192 | // Synchronize after store. 193 | template 194 | MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid, 195 | int indices[VT], OutputIt data, bool sync = true); 196 | 197 | // For 0 <= i < VT: 198 | // shared[VT * tid + i] = threadReg[i]; 199 | // Synchronize after store. 200 | // Note this function moves data in THREAD ORDER. 201 | // (DeviceRegToShared moves data in STRIDED ORDER). 202 | template 203 | MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared, 204 | bool sync = true); 205 | 206 | // For 0 <= i < VT: 207 | // threadReg[i] = shared[VT * tid + i]; 208 | // Synchronize after load. 209 | // Note this function moves data in THREAD ORDER. 210 | // (DeviceSharedToReg moves data in STRIDED ORDER). 211 | template 212 | MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg, 213 | bool sync = true); 214 | 215 | // For 0 <= index < aCount: 216 | // shared[index] = a_global[index]; 217 | // For 0 <= index < bCount: 218 | // shared[aCount + index] = b_global[index]; 219 | // VT0 is the lower-bound for predication-free execution: 220 | // If count >= NT * VT0, a predication-free branch is taken. 221 | // VT1 is the upper-bound for loads: 222 | // NT * VT1 must >= aCount + bCount. 
223 | 224 | template 225 | MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount, 226 | const T* b_global, int bCount, int tid, T* reg, bool sync = false); 227 | 228 | template 229 | MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount, 230 | const T* b_global, int bCount, int tid, T* shared, bool sync = true); 231 | 232 | template 234 | MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount, 235 | InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false); 236 | 237 | template 239 | MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount, 240 | InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true); 241 | 242 | // For 0 <= i < VT 243 | // index = NT * i + tid; 244 | // if(index < count) 245 | // gather = indices_shared[index]; 246 | // dest_global[index] = data_global[gather]; 247 | // Synchronize after load. 248 | template 249 | MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global, 250 | const int* indices_shared, int tid, OutputIt dest_global, 251 | bool sync = true); 252 | 253 | // For 0 <= i < VT 254 | // index = NT * i + tid 255 | // if(index < count) 256 | // gather = indices[index]; 257 | // if(gather < aCount) data = a_global[gather]; 258 | // else data = b_global[gather - aCount]; 259 | // dest_global[index] = data; 260 | // Synchronize after load. 
261 | template 263 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global, 264 | InputIt2 b_global, int bStart, const int* indices, int tid, 265 | T* reg, bool sync = false); 266 | 267 | template 269 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global, 270 | InputIt2 b_global, int bStart, const int* indices_shared, int tid, 271 | OutputIt dest_global, bool sync = true); 272 | 273 | template 274 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global, 275 | const T* b_global, int bStart, const int* indices, int tid, 276 | T* reg, bool sync = false); 277 | 278 | template 279 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global, 280 | const T* b_global, int bStart, const int* indices_shared, int tid, 281 | OutputIt dest_global, bool sync = true); 282 | 283 | 284 | 285 | } // namespace mgpu 286 | 287 | 288 | #include "device/loadstore.cuh" 289 | #include "device/ctasegscan.cuh" 290 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? 
(x) : (-(x))) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> (y)) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ?
(Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explicitly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/ctc.h: -------------------------------------------------------------------------------- 1 | /** \file ctc.h 2 | * Contains a simple C interface to call fast CPU and GPU based computation 3 | * of the CTC loss.
4 | */ 5 | 6 | #pragma once 7 | 8 | #ifdef __cplusplus 9 | #include 10 | extern "C" { 11 | #endif 12 | 13 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers 14 | typedef struct CUstream_st* CUstream; 15 | 16 | typedef enum { 17 | CTC_STATUS_SUCCESS = 0, 18 | CTC_STATUS_MEMOPS_FAILED = 1, 19 | CTC_STATUS_INVALID_VALUE = 2, 20 | CTC_STATUS_EXECUTION_FAILED = 3, 21 | CTC_STATUS_UNKNOWN_ERROR = 4 22 | } ctcStatus_t; 23 | 24 | /** Returns a string containing a description of status that was passed in 25 | * \param[in] status identifies which string should be returned 26 | * \return C style string containing the text description 27 | * */ 28 | const char* ctcGetStatusString(ctcStatus_t status); 29 | 30 | typedef enum { 31 | CTC_CPU = 0, 32 | CTC_GPU = 1 33 | } ctcComputeLocation; 34 | 35 | /** Structure used to indicate where the ctc calculation should take place 36 | * and parameters associated with that place. 37 | * Cpu execution can specify the maximum number of threads that can be used 38 | * Gpu execution can specify which stream the kernels should be launched in. 39 | * */ 40 | struct ctcComputeInfo { 41 | ctcComputeLocation loc; 42 | union { 43 | unsigned int num_threads; 44 | CUstream stream; 45 | }; 46 | }; 47 | 48 | /** Compute the connectionist temporal classification loss between a sequence 49 | * of probabilities and a ground truth labeling. Optionally compute the 50 | * gradient with respect to the inputs. 51 | * \param [in] activations pointer to the activations in either CPU or GPU 52 | * addressable memory, depending on info. We assume a fixed 53 | * memory layout for this 3 dimensional tensor, which has dimension 54 | * (t, n, p), where t is the time index, n is the minibatch index, 55 | * and p indexes over probabilities of each symbol in the alphabet. 
56 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 57 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 58 | * changing index, aka column-major). We also assume strides are equal to 59 | * dimensions - there is no padding between dimensions. 60 | * More precisely, element (t, n, p), for a problem with mini_batch examples 61 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 62 | * activations[(t * mini_batch + n) * alphabet_size + p] 63 | * \param [out] gradients if not NULL, then gradients are computed. Should be 64 | * allocated in the same memory space as probs and memory 65 | * ordering is identical. 66 | * \param [in] flat_labels Always in CPU memory. A concatenation 67 | * of all the labels for the minibatch. 68 | * \param [in] label_lengths Always in CPU memory. The length of each label 69 | * for each example in the minibatch. 70 | * \param [in] input_lengths Always in CPU memory. The number of time steps 71 | * for each sequence in the minibatch. 72 | * \param [in] alphabet_size The number of possible output symbols. There 73 | * should be this many probabilities for each time step. 74 | * \param [in] mini_batch How many examples in a minibatch. 75 | * \param [out] costs Always in CPU memory. The cost of each example in the 76 | * minibatch. 77 | * \param [in,out] workspace In same memory space as probs. Should be of 78 | * size requested by get_workspace_size. 79 | * \param [in] ctcComputeInfo describes whether or not the execution should 80 | * take place on the CPU or GPU, and by extension the location of 81 | * the probs and grads pointers. Can be used to set the 82 | * number of threads for cpu execution or the stream for gpu 83 | * execution. 
84 | * 85 | * \return Status information 86 | * 87 | * */ 88 | ctcStatus_t compute_ctc_loss(const float* const activations, 89 | float* gradients, 90 | const int* const flat_labels, 91 | const int* const label_lengths, 92 | const int* const input_lengths, 93 | int alphabet_size, 94 | int minibatch, 95 | float *costs, 96 | void *workspace, 97 | ctcComputeInfo info); 98 | 99 | // Simple wrappers to enable neon support 100 | int compute_ctc_loss_cpu(const float* const activations, 101 | float* gradients, 102 | const int* const flat_labels, 103 | const int* const label_lengths, 104 | const int* const input_lengths, 105 | int alphabet_size, 106 | int minibatch, 107 | float *costs, 108 | int num_threads); 109 | 110 | #ifdef __CUDACC__ 111 | int get_workspace_size_gpu(const int* const label_lengths, 112 | const int* const input_lengths, 113 | int alphabet_size, int minibatch, 114 | CUstream stream); 115 | 116 | int compute_ctc_loss_gpu(const float* const activations, 117 | float* gradients, 118 | const int* const flat_labels, 119 | const int* const label_lengths, 120 | const int* const input_lengths, 121 | int alphabet_size, 122 | int minibatch, 123 | float *costs, 124 | void *workspace, 125 | CUstream stream); 126 | #endif 127 | 128 | 129 | /** For a given set of labels and minibatch size return the required workspace 130 | * size. This will need to be allocated in the same memory space as your 131 | * probabilities. 132 | * \param [in] label_lengths Always in CPU memory. The length of each label 133 | * for each example in the minibatch. 134 | * \param [in] input_lengths Always in CPU memory. The number of time steps 135 | * for each sequence in the minibatch. 136 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 137 | * the number of probabilities at each time step 138 | * \param [in] mini_batch How many examples in a minibatch. 
139 | * \param [in] info struct describing the location (cpu/gpu) and associated 140 | * parameters of execution 141 | * \param [out] size_bytes is pointer to a scalar where the memory 142 | * requirement in bytes will be placed. This memory should be allocated 143 | * at the same place, CPU or GPU, that the probs are in 144 | * 145 | * \return Status information 146 | **/ 147 | ctcStatus_t get_workspace_size(const int* const label_lengths, 148 | const int* const input_lengths, 149 | int alphabet_size, int minibatch, 150 | ctcComputeInfo info, 151 | size_t* size_bytes); 152 | 153 | 154 | #ifdef __cplusplus 155 | } 156 | #endif 157 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/ctc_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "hostdevice.h" 8 | 9 | namespace ctc_helper { 10 | 11 | static const int BLANK = 0; 12 | static const float threshold = 1e-1; 13 | 14 | template 15 | HOSTDEVICE 16 | T neg_inf() { return -T(INFINITY); } 17 | 18 | inline int div_up(int x, int y) { 19 | return (x + y - 1) / y; 20 | } 21 | 22 | template struct maximum { 23 | HOSTDEVICE 24 | Res operator()(const Arg& x, const Arg& y) const { 25 | return x < y ? 
y : x; 26 | } 27 | }; 28 | 29 | template struct add { 30 | HOSTDEVICE 31 | Res operator()(const Arg& x, const Arg& y) const { 32 | return x + y; 33 | } 34 | }; 35 | 36 | template struct identity { 37 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);} 38 | }; 39 | 40 | template struct negate { 41 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);} 42 | }; 43 | 44 | template struct exponential { 45 | HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);} 46 | }; 47 | 48 | template 49 | struct log_plus { 50 | typedef Res result_type; 51 | HOSTDEVICE 52 | Res operator()(const Arg1& p1, const Arg2& p2) { 53 | if (p1 == neg_inf()) 54 | return p2; 55 | if (p2 == neg_inf()) 56 | return p1; 57 | Res result = log1p(exp(-fabs(p1 - p2))) + maximum()(p1, p2); 58 | return result; 59 | } 60 | }; 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 4 | ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 5 | ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 6 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/__init__.py: -------------------------------------------------------------------------------- 1 | 
from ctc import cpu_ctc_np, cpu_ctc_th 2 | del ctc 3 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/ctc.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import os 3 | import numpy as np 4 | import numpy.ctypeslib as npct 5 | import ctypes 6 | import ctypes.util 7 | 8 | import theano 9 | import theano.tensor as T 10 | from theano.gradient import grad_undefined 11 | 12 | if platform.system() == "Darwin": 13 | ext = "dylib" 14 | elif platform.system() == "Linux": 15 | ext = "so" 16 | else: 17 | raise Exception("Unsupported platform: {}".format(platform.system())) 18 | libwarpctc = npct.load_library(os.path.join(os.path.dirname(__file__), "../build/libwarpctc.{}".format(ext)), "") 19 | 20 | libwarpctc.cpu_ctc.restype = None 21 | libwarpctc.cpu_ctc.argtypes = [ 22 | npct.ndpointer(dtype=np.float32, ndim=3), 23 | npct.ndpointer(dtype=np.float32, ndim=3), 24 | npct.ndpointer(dtype=np.int32, ndim=1), 25 | npct.ndpointer(dtype=np.int32, ndim=1), 26 | npct.ndpointer(dtype=np.int32, ndim=1), 27 | ctypes.c_int, 28 | ctypes.c_int, 29 | npct.ndpointer(dtype=np.float32, ndim=1), 30 | ctypes.c_int] 31 | 32 | def cpu_ctc_np(acts, act_lens, labels, label_lens): 33 | """ 34 | acts: 3-d numpy float array, same as c++ bindings 35 | act_lens: 1-d int array of input length of each example 36 | labels: list of 1-d int array for each example in minibatch 37 | label_lens: 1-d int array of label length of each example 38 | """ 39 | # make sure correct types 40 | acts = np.array(acts, dtype=np.float32) 41 | act_lens = np.array(act_lens, dtype=np.int32) 42 | labels = np.array(labels, dtype=np.int32) 43 | label_lens = np.array(label_lens, dtype=np.int32) 44 | 45 | # C needs sizes 46 | alphabet_size = acts.shape[2] 47 | minibatch = acts.shape[1] 48 | 49 | # create return variables 50 | grads = np.zeros_like(acts, dtype=np.float32) 51 | cost = np.zeros((minibatch,), 
dtype=np.float32) 52 | 53 | # compute 54 | libwarpctc.cpu_ctc(acts, grads, labels, label_lens, act_lens, alphabet_size, minibatch, cost, 1) 55 | return cost, grads 56 | 57 | class CPUCTCGrad(theano.Op): 58 | # Properties attribute 59 | __props__ = () 60 | 61 | def make_node(self, *inputs): 62 | inputs = map(theano.tensor.as_tensor_variable, inputs) 63 | # add checks here for types and numdims of all inputs 64 | return theano.Apply(self, inputs, [T.ftensor3()]) 65 | 66 | def perform(self, node, inputs, outputs): 67 | inputs[0] = inputs[0].astype(np.float32) 68 | inputs[1] = inputs[1].astype(np.int32) 69 | inputs[2] = inputs[2].astype(np.int32) 70 | inputs[3] = inputs[3].astype(np.int32) 71 | cost, gradients = cpu_ctc_np(*inputs) 72 | outputs[0][0] = gradients 73 | 74 | class CPUCTC(theano.Op): 75 | # Properties attribute 76 | __props__ = () 77 | 78 | def make_node(self, *inputs): 79 | inputs = map(theano.tensor.as_tensor_variable, inputs) 80 | # add checks here for types and numdims of all inputs 81 | return theano.Apply(self, inputs, [T.fvector()]) 82 | 83 | def perform(self, node, inputs, outputs): 84 | inputs[0] = inputs[0].astype(np.float32) 85 | inputs[1] = inputs[1].astype(np.int32) 86 | inputs[2] = inputs[2].astype(np.int32) 87 | inputs[3] = inputs[3].astype(np.int32) 88 | cost, gradients = cpu_ctc_np(*inputs) 89 | outputs[0][0] = cost 90 | 91 | def grad(self, inputs, output_grads): 92 | gradients = CPUCTCGrad()(*inputs) 93 | return [gradients, 94 | grad_undefined(self, 1, inputs[1]), 95 | grad_undefined(self, 2, inputs[2]), 96 | grad_undefined(self, 3, inputs[3])] 97 | 98 | cpu_ctc_th = CPUCTC() 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup(name='ctc', 3 | version='0.1', 4 | packages=['ctc'], 5 | package_dir={'ctc': '.'}, 6 | 
package_data={'ctc' : ['../build/libwarpctc.*']} 7 | ) 8 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/ctc_entrypoint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "detail/cpu_ctc.h" 8 | #ifdef __CUDACC__ 9 | #include "detail/gpu_ctc.h" 10 | #endif 11 | 12 | 13 | extern "C" { 14 | 15 | const char* ctcGetStatusString(ctcStatus_t status) { 16 | switch (status) { 17 | case CTC_STATUS_SUCCESS: 18 | return "no error"; 19 | case CTC_STATUS_MEMOPS_FAILED: 20 | return "cuda memcpy or memset failed"; 21 | case CTC_STATUS_INVALID_VALUE: 22 | return "invalid value"; 23 | case CTC_STATUS_EXECUTION_FAILED: 24 | return "execution failed"; 25 | 26 | case CTC_STATUS_UNKNOWN_ERROR: 27 | default: 28 | return "unknown error"; 29 | 30 | } 31 | 32 | } 33 | 34 | inline void throw_on_error(ctcStatus_t status, const char* message) { 35 | if (status != CTC_STATUS_SUCCESS) { 36 | throw std::runtime_error(message + (", stat = " + 37 | std::string(ctcGetStatusString(status)))); 38 | } 39 | } 40 | 41 | 42 | ctcStatus_t compute_ctc_loss(const float* const activations, 43 | float* gradients, 44 | const int* const flat_labels, 45 | const int* const label_lengths, 46 | const int* const input_lengths, 47 | int alphabet_size, 48 | int minibatch, 49 | float *costs, 50 | void *workspace, 51 | ctcComputeInfo info) { 52 | 53 | if (activations == nullptr || 54 | flat_labels == nullptr || 55 | label_lengths == nullptr || 56 | input_lengths == nullptr || 57 | costs == nullptr || 58 | workspace == nullptr || 59 | alphabet_size <= 0 || 60 | minibatch <= 0) 61 | return CTC_STATUS_INVALID_VALUE; 62 | 63 | if (info.loc == CTC_CPU) { 64 | CpuCTC ctc(alphabet_size, minibatch, workspace, info.num_threads); 65 | 66 | if (gradients != NULL) 67 | return ctc.cost_and_grad(activations, gradients, 68 | costs, 69 | flat_labels, label_lengths, 70 
| input_lengths); 71 | else 72 | return ctc.score_forward(activations, costs, flat_labels, 73 | label_lengths, input_lengths); 74 | } else if (info.loc == CTC_GPU) { 75 | #ifdef __CUDACC__ 76 | GpuCTC ctc(alphabet_size, minibatch, workspace, info.stream); 77 | 78 | if (gradients != NULL) 79 | return ctc.cost_and_grad(activations, gradients, costs, 80 | flat_labels, label_lengths, 81 | input_lengths); 82 | else 83 | return ctc.score_forward(activations, costs, flat_labels, 84 | label_lengths, input_lengths); 85 | #else 86 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 87 | return CTC_STATUS_EXECUTION_FAILED; 88 | #endif 89 | } else { 90 | return CTC_STATUS_INVALID_VALUE; 91 | } 92 | } 93 | 94 | 95 | ctcStatus_t get_workspace_size(const int* const label_lengths, 96 | const int* const input_lengths, 97 | int alphabet_size, int minibatch, 98 | ctcComputeInfo info, 99 | size_t* size_bytes) 100 | { 101 | if (label_lengths == nullptr || 102 | input_lengths == nullptr || 103 | size_bytes == nullptr || 104 | alphabet_size <= 0 || 105 | minibatch <= 0) 106 | return CTC_STATUS_INVALID_VALUE; 107 | 108 | // This is the max of all S and T for all examples in the minibatch. 
109 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 110 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 111 | 112 | const int S = 2 * maxL + 1; 113 | 114 | *size_bytes = 0; 115 | 116 | if (info.loc == CTC_GPU) { 117 | // GPU storage 118 | //nll_forward, nll_backward 119 | *size_bytes += 2 * sizeof(float) * minibatch; 120 | 121 | //repeats 122 | *size_bytes += sizeof(int) * minibatch; 123 | 124 | //label offsets 125 | *size_bytes += sizeof(int) * minibatch; 126 | 127 | //utt_length 128 | *size_bytes += sizeof(int) * minibatch; 129 | 130 | //label lengths 131 | *size_bytes += sizeof(int) * minibatch; 132 | 133 | //labels without blanks - overallocate for now 134 | *size_bytes += sizeof(int) * maxL * minibatch; 135 | 136 | //labels with blanks 137 | *size_bytes += sizeof(int) * S * minibatch; 138 | 139 | //alphas 140 | *size_bytes += sizeof(float) * S * maxT * minibatch; 141 | 142 | //denoms 143 | *size_bytes += sizeof(float) * maxT * minibatch; 144 | 145 | //probs (since we will pass in activations) 146 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 147 | 148 | } else { 149 | //cpu can eventually replace all minibatch with 150 | //max number of concurrent threads if memory is 151 | //really tight 152 | 153 | //per minibatch memory 154 | size_t per_minibatch_bytes = 0; 155 | 156 | //output 157 | per_minibatch_bytes += sizeof(float) * alphabet_size ; 158 | 159 | //alphas 160 | per_minibatch_bytes += sizeof(float) * S * maxT; 161 | 162 | //betas 163 | per_minibatch_bytes += sizeof(float) * S; 164 | 165 | //labels w/blanks, e_inc, s_inc 166 | per_minibatch_bytes += 3 * sizeof(int) * S; 167 | 168 | *size_bytes = per_minibatch_bytes * minibatch; 169 | 170 | //probs 171 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 172 | } 173 | 174 | return CTC_STATUS_SUCCESS; 175 | } 176 | 177 | /* 178 | Simple wrappers for neon compatibility 179 | */ 180 | #ifdef __CUDACC__ 181 | int 
get_workspace_size_gpu(const int* const label_lengths, 182 | const int* const input_lengths, 183 | int alphabet_size, int minibatch, 184 | cudaStream_t stream) { 185 | ctcComputeInfo info; 186 | info.loc = CTC_GPU; 187 | info.stream = stream; 188 | 189 | size_t size_bytes; 190 | get_workspace_size(label_lengths, input_lengths, alphabet_size, 191 | minibatch, info, &size_bytes); 192 | 193 | return int(size_bytes); 194 | } 195 | 196 | 197 | int compute_ctc_loss_gpu(const float* const activations, 198 | float* gradients, 199 | const int* const flat_labels, 200 | const int* const label_lengths, 201 | const int* const input_lengths, 202 | int alphabet_size, 203 | int minibatch, 204 | float *costs, 205 | void *workspace, 206 | cudaStream_t stream) { 207 | 208 | ctcComputeInfo info; 209 | info.loc = CTC_GPU; 210 | info.stream = stream; 211 | 212 | ctcStatus_t status = compute_ctc_loss(activations, 213 | gradients, 214 | flat_labels, 215 | label_lengths, 216 | input_lengths, 217 | alphabet_size, 218 | minibatch, 219 | costs, 220 | workspace, 221 | info); 222 | 223 | // Maybe call throw_on_error here? 
224 | return int(status); 225 | 226 | } 227 | #endif 228 | 229 | int compute_ctc_loss_cpu(const float* const activations, 230 | float* gradients, 231 | const int* const flat_labels, 232 | const int* const label_lengths, 233 | const int* const input_lengths, 234 | int alphabet_size, 235 | int minibatch, 236 | float *costs, 237 | int num_threads) { 238 | ctcComputeInfo info; 239 | info.loc = CTC_CPU; 240 | info.num_threads = num_threads; 241 | 242 | size_t size_bytes; 243 | get_workspace_size(label_lengths, 244 | input_lengths, 245 | alphabet_size, 246 | minibatch, 247 | info, 248 | &size_bytes); 249 | 250 | void* workspace = malloc(size_bytes); 251 | 252 | ctcStatus_t status = compute_ctc_loss(activations, 253 | gradients, 254 | flat_labels, 255 | label_lengths, 256 | input_lengths, 257 | alphabet_size, 258 | minibatch, 259 | costs, 260 | workspace, 261 | info); 262 | free(workspace); 263 | 264 | // Maybe call throw_on_error here? 265 | return int(status); 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/ctc_entrypoint.cu: -------------------------------------------------------------------------------- 1 | ctc_entrypoint.cpp -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/reduce.cu: -------------------------------------------------------------------------------- 1 | // Includes, system 2 | // #include 3 | // #include 4 | 5 | // Includes, cuda 6 | // #include 7 | // #include 8 | 9 | // Includes, cuda helper functions 10 | // #include 11 | 12 | // For the functors 13 | #include "detail/ctc_helper.h" 14 | #include "ctc.h" 15 | 16 | const int warp_size = 32; 17 | 18 | template 19 | struct CTAReduce; 20 | 21 | template 22 | struct CTAReduce { 23 | enum { Size = NT, Capacity = NT }; 24 | struct Storage { T shared[Capacity]; }; 25 | 26 | __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { 27 | T* s = 
storage.shared; 28 | s[tid] = x; 29 | __syncthreads(); 30 | 31 | // Fold the data in half with each pass. 32 | #pragma unroll 33 | for(int offset = NT / 2; offset >= warp_size; offset /= 2) { 34 | if(tid + offset < count && tid < offset) { 35 | // Read from the right half and store to the left half. 36 | x = g(x, s[offset + tid]); 37 | s[tid] = x; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | T shuff; 43 | for (int offset = warp_size / 2; offset > 0; offset /= 2) { 44 | shuff = __shfl_down(x, offset); 45 | if (tid + offset < count && tid < offset) 46 | x = g(x, shuff); 47 | } 48 | return x; 49 | } 50 | }; 51 | 52 | template 53 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output, 54 | int num_rows, int num_cols) { 55 | 56 | typedef CTAReduce R; 57 | __shared__ typename R::Storage storage; 58 | 59 | int tid = threadIdx.x; 60 | int idx = tid; 61 | int col = blockIdx.x; 62 | T curr; 63 | 64 | // Each block works on a column 65 | if (idx < num_rows) 66 | curr = f(input[idx + col*num_rows]); 67 | idx += NT; 68 | 69 | 70 | while (idx < num_rows) { 71 | curr += f(input[idx + col*num_rows]); 72 | idx += NT; 73 | } 74 | 75 | // Sum thread-totals over the CTA. 
76 | curr = R::reduce(tid, curr, storage, num_rows, g); 77 | 78 | // Store result in out 79 | if (tid == 0) 80 | output[col] = curr; 81 | } 82 | 83 | template 84 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output, 85 | int num_rows, int num_cols) { 86 | 87 | __shared__ T s[NT]; 88 | 89 | int warps_per_block = NT / warp_size; 90 | int row = blockDim.x * blockIdx.x + threadIdx.x; 91 | int col = threadIdx.y; 92 | T curr; 93 | 94 | if (row < num_rows && col < num_cols) { 95 | curr = f(input[row + col*num_rows]); 96 | col += blockDim.y; 97 | while (col < num_cols) { 98 | curr = g(curr, f(input[row + col*num_rows])); 99 | col += blockDim.y; 100 | } 101 | } 102 | s[threadIdx.x * warps_per_block + threadIdx.y] = curr; 103 | __syncthreads(); 104 | 105 | // Reduce 106 | if (threadIdx.y == 0 && row < num_rows) { 107 | #pragma unroll 108 | for (int i = 1; i < warps_per_block && i < num_cols; ++i) 109 | curr = g(curr, s[i + threadIdx.x * warps_per_block]); 110 | output[row] = curr; 111 | } 112 | } 113 | 114 | struct ReduceHelper { 115 | 116 | template 117 | static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, cudaStream_t stream) { 118 | 119 | int grid_size; 120 | 121 | if (axis) { 122 | grid_size = num_cols; 123 | reduce_rows<128><<>> 124 | (f, g, input, output, num_rows, num_cols); 125 | 126 | } else { 127 | dim3 tpb(warp_size, 128 / warp_size); 128 | grid_size = (num_cols + warp_size - 1)/warp_size; 129 | reduce_cols<128><<>> 130 | (f, g, input, output, num_rows, num_cols); 131 | 132 | } 133 | } 134 | }; 135 | 136 | 137 | template 138 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, cudaStream_t stream) { 139 | ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); 140 | cudaStreamSynchronize(stream); 141 | cudaError_t err = cudaGetLastError(); 142 | if (err != cudaSuccess) 143 | return CTC_STATUS_EXECUTION_FAILED; 144 | 145 | return CTC_STATUS_SUCCESS; 146 
| } 147 | 148 | ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 149 | return reduce(ctc_helper::negate(), ctc_helper::add(), input, output, rows, cols, axis, stream); 150 | } 151 | 152 | ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 153 | return reduce(ctc_helper::exponential(), ctc_helper::add(), input, output, rows, cols, axis, stream); 154 | } 155 | 156 | ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 157 | return reduce(ctc_helper::identity(), ctc_helper::maximum(),input, output, rows, cols, axis, stream); 158 | } 159 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | inline void throw_on_error(ctcStatus_t status, const char* message) { 10 | if (status != CTC_STATUS_SUCCESS) { 11 | throw std::runtime_error(message + (", stat = " + 12 | std::string(ctcGetStatusString(status)))); 13 | } 14 | } 15 | 16 | #ifdef __CUDACC__ 17 | #include 18 | #include 19 | 20 | inline void throw_on_error(cudaError_t error, const char* message) { 21 | if (error) { 22 | throw thrust::system_error(error, thrust::cuda_category(), message); 23 | } 24 | } 25 | 26 | #endif 27 | 28 | std::vector 29 | genActs(int size) { 30 | std::vector arr(size); 31 | std::mt19937 gen(0); 32 | std::uniform_real_distribution<> dis(0, 1); 33 | for(int i = 0; i < size; ++i) 34 | arr[i] = dis(gen); 35 | return arr; 36 | } 37 | 38 | std::vector 39 | genLabels(int alphabet_size, int L) { 40 | std::vector label(L); 41 | 42 | std::mt19937 gen(1); 43 | std::uniform_int_distribution<> dis(1, alphabet_size - 1); 44 | 45 | for(int i = 0; i < L; ++i) { 46 | label[i] = dis(gen); 47 | } 48 | // 
guarantee repeats for testing 49 | if (L >= 3) { 50 | label[L / 2] = label[L / 2 + 1]; 51 | label[L / 2 - 1] = label[L / 2]; 52 | } 53 | return label; 54 | } 55 | 56 | float rel_diff(const std::vector& grad, 57 | const std::vector& num_grad) { 58 | float diff = 0.; 59 | float tot = 0.; 60 | for(size_t idx = 0; idx < grad.size(); ++idx) { 61 | diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); 62 | tot += grad[idx] * grad[idx]; 63 | } 64 | 65 | return diff / tot; 66 | } 67 | 68 | // Numerically stable softmax for a minibatch of 1 69 | void softmax(const float* const acts, 70 | int alphabet_size, int T, 71 | float *probs) { 72 | 73 | for (int t = 0; t < T; ++t) { 74 | 75 | float max_activation = 76 | -std::numeric_limits::infinity(); 77 | 78 | for (int a = 0; a < alphabet_size; ++a) 79 | max_activation = 80 | std::max(max_activation, acts[t*alphabet_size + a]); 81 | 82 | float denom = 0; 83 | for (int a = 0; a < alphabet_size; ++a) 84 | denom += std::exp(acts[t*alphabet_size + a] - max_activation); 85 | 86 | for (int a = 0; a < alphabet_size; ++a) 87 | probs[t*alphabet_size + a] = 88 | std::exp(acts[t*alphabet_size + a] - max_activation) / denom; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/tests/test_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "test.h" 11 | 12 | bool small_test() { 13 | const int alphabet_size = 5; 14 | const int T = 2; 15 | 16 | std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, 17 | 0.1, 0.1, 0.6, 0.1, 0.1}; 18 | 19 | // Calculate the score analytically 20 | float expected_score; 21 | { 22 | std::vector probs(activations.size()); 23 | softmax(activations.data(), alphabet_size, T, probs.data()); 24 | 25 | // Score calculation is specific to the given activations above 26 | expected_score = probs[1] * 
probs[7]; 27 | } 28 | 29 | std::vector labels = {1, 2}; 30 | std::vector label_lengths = {2}; 31 | 32 | std::vector lengths; 33 | lengths.push_back(T); 34 | 35 | float score; 36 | 37 | ctcComputeInfo info; 38 | info.loc = CTC_CPU; 39 | info.num_threads = 1; 40 | 41 | size_t cpu_alloc_bytes; 42 | throw_on_error(get_workspace_size(label_lengths.data(), lengths.data(), 43 | alphabet_size, lengths.size(), info, 44 | &cpu_alloc_bytes), 45 | "Error: get_workspace_size in small_test"); 46 | 47 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 48 | 49 | throw_on_error(compute_ctc_loss(activations.data(), NULL, 50 | labels.data(), label_lengths.data(), 51 | lengths.data(), 52 | alphabet_size, 53 | lengths.size(), 54 | &score, 55 | ctc_cpu_workspace, 56 | info), 57 | "Error: compute_ctc_loss in small_test"); 58 | 59 | free(ctc_cpu_workspace); 60 | score = std::exp(-score); 61 | const float eps = 1e-6; 62 | 63 | const float lb = expected_score - eps; 64 | const float ub = expected_score + eps; 65 | 66 | return (score > lb && score < ub); 67 | } 68 | 69 | bool inf_test() { 70 | const int alphabet_size = 15; 71 | const int T = 50; 72 | const int L = 10; 73 | const int minibatch = 1; 74 | 75 | std::vector labels = genLabels(alphabet_size, L); 76 | labels[0] = 2; 77 | std::vector label_lengths = {L}; 78 | 79 | std::vector acts = genActs(alphabet_size * T * minibatch); 80 | 81 | for (int i = 0; i < T; ++i) 82 | acts[alphabet_size * i + 2] = -1e30; 83 | 84 | std::vector sizes; 85 | sizes.push_back(T); 86 | 87 | std::vector grads(alphabet_size * T); 88 | 89 | float cost; 90 | 91 | ctcComputeInfo info; 92 | info.loc = CTC_CPU; 93 | info.num_threads = 1; 94 | 95 | size_t cpu_alloc_bytes; 96 | throw_on_error(get_workspace_size(label_lengths.data(), sizes.data(), 97 | alphabet_size, sizes.size(), info, 98 | &cpu_alloc_bytes), 99 | "Error: get_workspace_size in inf_test"); 100 | 101 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 102 | 103 | 
throw_on_error(compute_ctc_loss(acts.data(), grads.data(), 104 | labels.data(), label_lengths.data(), 105 | sizes.data(), 106 | alphabet_size, 107 | sizes.size(), 108 | &cost, 109 | ctc_cpu_workspace, 110 | info), 111 | "Error: compute_ctc_loss in inf_test"); 112 | 113 | free(ctc_cpu_workspace); 114 | 115 | bool status = true; 116 | status &= std::isinf(cost); 117 | 118 | for (int i = 0; i < alphabet_size * T; ++i) 119 | status &= !std::isnan(grads[i]); 120 | 121 | return status; 122 | } 123 | 124 | float grad_check(int T, int alphabet_size, 125 | std::vector& acts, 126 | const std::vector>& labels, 127 | const std::vector& sizes) { 128 | 129 | float epsilon = 1e-2; 130 | 131 | const int minibatch = labels.size(); 132 | 133 | std::vector flat_labels; 134 | std::vector label_lengths; 135 | for (const auto& l : labels) { 136 | flat_labels.insert(flat_labels.end(), l.begin(), l.end()); 137 | label_lengths.push_back(l.size()); 138 | } 139 | 140 | std::vector costs(minibatch); 141 | 142 | std::vector grads(acts.size()); 143 | 144 | ctcComputeInfo info; 145 | info.loc = CTC_CPU; 146 | info.num_threads = 1; 147 | 148 | size_t cpu_alloc_bytes; 149 | throw_on_error(get_workspace_size(label_lengths.data(), sizes.data(), 150 | alphabet_size, sizes.size(), info, 151 | &cpu_alloc_bytes), 152 | "Error: get_workspace_size in grad_check"); 153 | 154 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 155 | 156 | throw_on_error(compute_ctc_loss(acts.data(), grads.data(), 157 | flat_labels.data(), label_lengths.data(), 158 | sizes.data(), 159 | alphabet_size, 160 | minibatch, 161 | costs.data(), 162 | ctc_cpu_workspace, 163 | info), 164 | "Error: compute_ctc_loss (0) in grad_check"); 165 | 166 | float cost = std::accumulate(costs.begin(), costs.end(), 0.); 167 | 168 | std::vector num_grad(grads.size()); 169 | 170 | //perform 2nd order central differencing 171 | for (int i = 0; i < T * alphabet_size * minibatch; ++i) { 172 | 173 | std::vector costsP1(minibatch); 174 | std::vector 
costsP2(minibatch); 175 | 176 | acts[i] += epsilon; 177 | throw_on_error(compute_ctc_loss(acts.data(), NULL, 178 | flat_labels.data(), label_lengths.data(), 179 | sizes.data(), 180 | alphabet_size, 181 | minibatch, 182 | costsP1.data(), 183 | ctc_cpu_workspace, 184 | info), 185 | "Error: compute_ctc_loss (1) in grad_check"); 186 | 187 | acts[i] -= 2 * epsilon; 188 | throw_on_error(compute_ctc_loss(acts.data(), NULL, 189 | flat_labels.data(), label_lengths.data(), 190 | sizes.data(), 191 | alphabet_size, 192 | minibatch, 193 | costsP2.data(), 194 | ctc_cpu_workspace, 195 | info), 196 | "Error: compute_ctc_loss (2) in grad_check"); 197 | 198 | float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); 199 | float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); 200 | 201 | acts[i] += epsilon; 202 | num_grad[i] = (costP1 - costP2) / (2 * epsilon); 203 | } 204 | 205 | free(ctc_cpu_workspace); 206 | 207 | float diff = rel_diff(grads, num_grad); 208 | 209 | return diff; 210 | } 211 | 212 | bool run_tests() { 213 | std::vector> problem_sizes = 214 | {std::make_tuple(20, 50, 15, 1, 1e-5), 215 | std::make_tuple(5, 10, 5, 65, 1e-4) 216 | }; 217 | 218 | std::mt19937 gen(2); 219 | 220 | bool status = true; 221 | for (auto problem : problem_sizes) { 222 | int alphabet_size, T, L, minibatch; 223 | float tol; 224 | std::tie(alphabet_size, T, L, minibatch, tol) = problem; 225 | 226 | std::vector acts = genActs(alphabet_size * T * minibatch); 227 | 228 | std::vector> labels; 229 | std::vector sizes; 230 | for (int mb = 0; mb < minibatch; ++mb) { 231 | int actual_length = L; 232 | labels.push_back(genLabels(alphabet_size, actual_length)); 233 | sizes.push_back(T); 234 | } 235 | 236 | float diff = grad_check(T, alphabet_size, acts, labels, sizes); 237 | 238 | status &= (diff < tol); 239 | } 240 | 241 | return status; 242 | } 243 | 244 | int main(void) { 245 | std::cout << "Running CPU tests" << std::endl; 246 | 247 | bool status = true; 248 | status &= 
small_test(); 249 | status &= inf_test(); 250 | status &= run_tests(); 251 | 252 | if (status) 253 | std::cout << "Tests pass" << std::endl; 254 | else 255 | std::cout << "Some or all tests fail" << std::endl; 256 | } 257 | --------------------------------------------------------------------------------