├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── requirements.txt ├── speech ├── __init__.py ├── ctc.py ├── data │ ├── __init__.py │ ├── dataloader.py │ └── ingest_librispeech.py ├── decoder.py ├── evaluate.py ├── sample_proposals_callback.py ├── train.py └── utils.py └── src └── transforms ├── .gitignore ├── Makefile └── warp-ctc ├── CMakeLists.txt ├── LICENSE ├── README.md ├── doc ├── baidu-research-logo-small.png └── deep-speech-ctc-small.png ├── examples ├── loader.py ├── rnnctc.py └── simple.py ├── include ├── contrib │ └── moderngpu │ │ ├── LICENSE │ │ └── include │ │ ├── device │ │ ├── ctaloadbalance.cuh │ │ ├── ctamerge.cuh │ │ ├── ctascan.cuh │ │ ├── ctasearch.cuh │ │ ├── ctasegreduce.cuh │ │ ├── ctasegscan.cuh │ │ ├── ctasegsort.cuh │ │ ├── ctasortedsearch.cuh │ │ ├── devicetypes.cuh │ │ ├── deviceutil.cuh │ │ ├── intrinsics.cuh │ │ ├── loadstore.cuh │ │ ├── serialsets.cuh │ │ └── sortnetwork.cuh │ │ ├── mgpudevice.cuh │ │ ├── mgpuenums.h │ │ └── util │ │ └── static.h ├── ctc.h └── detail │ ├── cpu_ctc.h │ ├── ctc_helper.h │ ├── gpu_ctc.h │ ├── gpu_ctc_kernels.h │ ├── hostdevice.h │ └── reduce.h ├── python ├── __init__.py ├── ctc.py └── setup.py ├── src ├── ctc_entrypoint.cpp ├── ctc_entrypoint.cu └── reduce.cu └── tests ├── test.h ├── test_cpu.cpp └── test_gpu.cu /.gitignore: -------------------------------------------------------------------------------- 1 | *.sublime-project 2 | *.sublime-workspace 3 | *.pyc 4 | *.pkl 5 | *.prm 6 | *.so 7 | *.swo 8 | *.swp 9 | .DS_Store 10 | neon/version.py 11 | *@eaDir 12 | .pkgs 13 | *.egg-info 14 | .venv 15 | .venv[23] 16 | .styleenv 17 | .coverage 18 | build 19 | *.gz 20 | generated 21 | *.ropeproject 22 | *.cubin 23 | *.hdf5 24 | *.h5 25 | *.html 26 | *.txt 27 | *.log 28 | neon_help_output.txt 29 | neon/backends/util/cuda_capability 30 | neon/backends/kernels/ptx 31 | neon/backends/kernels/pre 32 | neon/backends/kernels/dump 33 | neon/data/loader/loader 34 | .idea/ 35 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default all clean 2 | 3 | default: all 4 | 5 | all: src/transforms/libwarpctc.so 6 | 7 | clean: 8 | @$(MAKE) -C src/transforms clean 9 | @find . -name '*.pyc' -delete 10 | 11 | src/transforms/libwarpctc.so: 12 | @$(MAKE) -C src/transforms 13 | 14 | 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 5 | # Implementation of Deep Speech 2 in neon 6 | 7 | This repository contains an implementation of Baidu SVAIL's [Deep Speech 2] 8 | model in neon. 
Much of the model is readily available in mainline neon; to also 9 | support the CTC cost function, we have included a neon-compatible wrapper for 10 | Baidu's [Warp-CTC]. 11 | 12 | Deep Speech 2 models are computationally intensive, and thus they can 13 | require long periods of time to run. Even with near-perfect GPU utilization, 14 | the model can take up to 1 week to train on large enough datasets to see 15 | respectable performance. Please keep this in mind when exploring this repo. 16 | 17 | We have used this code to train models on both the Wall Street Journal 18 | (81 hours) and Librispeech (1000 hours) datasets. The WSJ dataset is 19 | available through the LDC only; however, Librispeech can be freely acquired 20 | from [Librispeech corpus]. 21 | 22 | The model presented here uses a basic argmax-based decoder: 23 | 24 | * Choose the most probable character in each frame 25 | * Collapse the resulting output string according to CTC's rules: remove repeat 26 | characters first, remove blank characters next. 
27 | 28 | After decoding, you might expect outputs like this when trained on WSJ data: 29 | 30 | | Ground truth | Model output | 31 | |---------------------------------|-----------------------------------| 32 | | united presidential is a life insurance company | younited presidentiol is a lefe in surance company | 33 | | that was certainly true last week | that was sertainly true last week | 34 | | we're not ready to say we're in technical default a spokesman said | we're now ready to say we're intechnical default a spokesman said | 35 | 36 | Or outputs like this when trained on Librispeech (see "Decoding and 37 | evaluating a trained model"): 38 | 39 | | Ground truth | Model output | 40 | |---------------------------------|-----------------------------------| 41 | | this had some effect in calming him | this had some offectind calming him | 42 | | he went in and examined his letters but there was nothing from carrie | he went in an examined his letters but there was nothing from carry | 43 | | the design was different but the thing was clearly the same | the design was differampat that thing was clarly the same | 44 | 45 | ## Getting Started 46 | 1. [neon 2.3.0] and the [aeon] dataloader (v1.0.0) must both be installed. 47 | 48 | 2. Clone the repo: ```git clone https://github.com/NervanaSystems/deepspeech.git && cd deepspeech```. 49 | 50 | 3. Within a neon virtualenv, run ```pip install -r requirements.txt```. 51 | 52 | 4. Run ```make``` to build warp-ctc. 53 | 54 | ## Training a model 55 | ### 1. Prepare a manifest file for your dataset. 56 | The details on how to go about doing this are determined by the specifics of 57 | the dataset. 58 | 59 | 60 | #### Example: Librispeech recipe 61 | A recipe for ingesting Librispeech data is provided in ``data/ingest_librispeech.py``. 62 | Note that Librispeech provides distinct datasets for training and validation, 63 | and each set must be ingested separately. 
Additionally, we'll have to 64 | get around the quirky way that the Librispeech data is distributed; after 65 | "unpacking" the archives, we should re-pack them in a consistent manner. 66 | 67 | To be more precise, Librispeech data is distributed in zipped tar files, e.g. 68 | `train-clean-100.tar.gz` for training and `dev-clean.tar.gz` for validation. 69 | Upon unpacking, each archive creates a directory named ``LibriSpeech``, so 70 | trying to unpack both files together in the same directory is a bad idea. To 71 | get around this, try something like: 72 | 73 | ``` 74 | $ mkdir librispeech && cd librispeech 75 | $ wget http://www.openslr.org/resources/12/train-clean-100.tar.gz 76 | $ wget http://www.openslr.org/resources/12/dev-clean.tar.gz 77 | $ tar xvzf dev-clean.tar.gz LibriSpeech/dev-clean --strip-components=1 78 | $ tar xvzf train-clean-100.tar.gz LibriSpeech/train-clean-100 --strip-components=1 79 | ``` 80 | 81 | Follow the above prescription and you will have the training data as a 82 | subdirectory `librispeech/train-clean-100` and the validation data in a 83 | subdirectory `librispeech/dev-clean`. To ingest the data, you would then run the 84 | python script on the directory where you've unpacked the clean training data, 85 | followed by directions to where you want the script to write the transcripts and 86 | training mainfests for that dataset: 87 | 88 | ``` 89 | $ python data/ingest_librispeech.py 90 | ``` 91 | 92 | For example, if the absolute path to the train-clean-100 directory is located in 93 | ``/usr/local/data/librispeech/train-clean-100``, run: 94 | 95 | ``` 96 | $ python data/ingest_librispeech.py /usr/local/data/librispeech/train-clean-100 /usr/local/data/librispeech/train-clean-100/transcripts_dir /usr/local/data/librispeech/train-clean-100/train-manifest.csv 97 | ``` 98 | 99 | which would create a training manifest file named train-manifest.csv. 
Similarly, 100 | if the absolute path to the dev-clean directory is located at 101 | ``/usr/local/data/librispeech/dev-clean``, run: 102 | 103 | ``` 104 | $ python data/ingest_librispeech.py /usr/local/data/librispeech/dev-clean /usr/local/data/librispeech/dev-clean/transcripts_dir /usr/local/data/librispeech/train-clean-100/val-manifest.csv 105 | ``` 106 | 107 | To train on the full 1000 hours, execute the same commands for the 360 hour 108 | and 540 hour training datasets as well. The manifest files can then be 109 | concatenated with a simple: 110 | ``` 111 | $ cat /path/to/100_hour_manifest.csv /path/to/360_hour_manifest.csv /path/to/540_hour_manifest.csv > /path/to/1000_hour_manifest.csv 112 | ``` 113 | 114 | 115 | ### 2a. Train a new model 116 | 117 | ``` 118 | $ python train.py --manifest train: --manifest val: -e -z -s [-b ] 119 | ``` 120 | 121 | where `` is the path to the training manifest file produced 122 | in the ingest. For the example above, that path is ``/usr/local/data/librispeech/train-clean-100/train-manifest.csv``) 123 | and `` is the path to the validation manifest file. 124 | 125 | ### 2b. Continue training after pause on a previous model 126 | For a previously-trained model that wasn't trained for the full time needed, it's 127 | possible to resume training by passing the `--model_file ` 128 | argument to `train.py`. For example, you could continue training a pre-trained 129 | model from our [Model Zoo] sample. 130 | This particular model was trained using 1000 hours of speech data from the 131 | [Librispeech corpus]. The model was trained for 132 | 16 epochs after attaining a Character Error Rate (CER) of 14% without using a 133 | language model. You could continue training it for, say, an additional 4 epochs, 134 | by calling: 135 | 136 | ``` 137 | $ python train.py --manifest train: --manifest val: -e20 -z -s --model_file [-b ] 138 | ``` 139 | 140 | which will save a new model to `model_output.prm`. 
141 | 142 | ## Decoding and evaluating a trained model 143 | After you have a trained model, it's easy to evaluate its performance on any 144 | given dataset. Simply create a manifest file and then call: 145 | 146 | ``` 147 | $ python evaluate.py --manifest val:/path/to/manifest.csv --model_file /path/to/saved_model.prm 148 | ``` 149 | 150 | replacing the file paths as needed. It prints CERs (Character Error Rates) by 151 | default. To instead print WERs (Word Error Rates), include the argument 152 | `--use_wer`. 153 | 154 | For example, you could evaluate our pre-trained model from our [Model Zoo]. To 155 | evaluate the pre-trained model, follow these steps: 156 | 157 | 1. Download some test data from the Librispeech ASR corpus and prepare a 158 | manifest file for the dataset that follows the prescription provided above. 159 | 160 | 2. Download the [pre-trained DS2 model from our Model Zoo]. 161 | 162 | 3. Subject the pre-trained model and the manifest file for the test data to the 163 | `evaluate.py` script, as described above. 164 | 165 | 4. Optionally inspect the transcripts produced by the trained model; this can 166 | be done by appending it with the argument `--inference_file `. 167 | The result dumps the model transcripts together with the corresponding 168 | "ground truth" transcripts to a pickle file. 
169 | 170 | 171 | [Deep Speech 2]:https://arxiv.org/abs/1512.02595 172 | [neon 2.3.0]:https://github.com/NervanaSystems/neon 173 | [aeon]:https://github.com/NervanaSystems/aeon 174 | [Warp-CTC]: https://github.com/baidu-research/warp-ctc 175 | [Librispeech corpus]:http://www.openslr.org/12 176 | [Model Zoo]:https://github.com/NervanaSystems/ModelZoo 177 | [pre-trained DS2 model from our Model Zoo]:https://s3-us-west-1.amazonaws.com/nervana-modelzoo/Deep_Speech/Librispeech/librispeech_16_epochs.prm 178 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-levenshtein==0.12.0 2 | tqdm==4.8.4 3 | -------------------------------------------------------------------------------- /speech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/speech/__init__.py -------------------------------------------------------------------------------- /speech/ctc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | from neon import NervanaObject 17 | import platform 18 | import os 19 | import numpy as np 20 | import numpy.ctypeslib as npct 21 | import ctypes as ct 22 | import ctypes.util 23 | from neon.transforms.cost import Cost 24 | 25 | 26 | class CTC(Cost): 27 | 28 | def __init__(self, max_label_len, nout=29, blank=0): 29 | 30 | self.max_s = int(max_label_len) 31 | self.nout = nout 32 | self.input_warp = None 33 | self.y = None 34 | self.input_lengths = self.be.zeros((self.be.bsz), dtype=np.int32) 35 | 36 | self.ctclib = None 37 | 38 | def init_buffer(self, y): 39 | 40 | if self.input_warp is None or self.y is None or self.y is not y: 41 | self.y = y 42 | # warp-CTC requires activations.shape = (T, bsz, nout) 43 | self.input_warp = self.be.zeros( 44 | (self.max_t, self.be.bsz, self.nout)) 45 | # warp-CTC requires gradients.shape = (T, bsz, nout) 46 | self.grad_warp = self.be.zeros( 47 | (self.max_t, self.be.bsz, self.nout)) 48 | # warp-CTC requires cost.shape = (1, bsz) 49 | self.ctc_warp = self.be.zeros((1, self.be.bsz)) 50 | 51 | # neon requires gradients.shape = (nout, T*bsz) 52 | self.grad = self.be.iobuf((self.nout, self.max_t)) 53 | self.grad_view = self.grad.reshape( 54 | self.nout, self.max_t, self.be.bsz) 55 | # neon requires cost.shape = (nout, T*bsz) 56 | self.ctc_cost = self.be.zeros((1, self.max_t * self.be.bsz)) 57 | self.ctc_cost_view = self.ctc_cost.reshape( 58 | self.max_t, self.be.bsz).T 59 | 60 | def __call__(self, y, t): 61 | 62 | activations = y.reshape(self.nout, -1, self.be.bsz) 63 | self.max_t = activations.shape[1] 64 | self.init_buffer(y) 65 | self.grad_warp.fill(0.) 66 | self.ctc_warp.fill(0.) 67 | self.ctc_cost.fill(0.) 
68 | # flat_labels: minibatch worth of transcripts 69 | flat_labels = t[0] 70 | 71 | # label_lengths: minibatch worth of transcript lengths 72 | label_lengths = t[1] 73 | 74 | # input_lengths: minibatch worth of activation lengths 75 | self.input_lengths[:] = t[2].T * int(activations.shape[1]) / 100 76 | 77 | # reshape activations to match warp-CTC format 78 | self.be.copy_transpose(activations, self.input_warp, (1, 2, 0)) 79 | 80 | # call into warp-CTC 81 | self.be_ctc( 82 | self.nout, 83 | self.input_warp, 84 | flat_labels, 85 | self.grad_warp, 86 | label_lengths, 87 | self.input_lengths, 88 | self.ctc_warp) 89 | 90 | # warp-ctc only returns 1 cost for each example 91 | # broadcast ctc_warp (shape = (1,bsz)) to ctc_cost (shape=(1, T*bsz)) 92 | self.ctc_cost_view[:] = self.ctc_warp.T 93 | 94 | return self.ctc_cost 95 | 96 | def be_ctc( 97 | self, 98 | nout, 99 | inputs, 100 | labels, 101 | grads, 102 | label_lens, 103 | input_lens, 104 | costs): 105 | 106 | libpath = os.path.join(os.path.dirname(__file__), 107 | "..", "src", "transforms", "libwarpctc.so") 108 | assert os.path.isfile(libpath), "libwarpctc.so not found. 
Run make" 109 | self.ctclib = npct.load_library(libpath, "") 110 | 111 | if self.be.backend_name == "gpu": 112 | self.be_ctc_gpu( 113 | nout, 114 | inputs, 115 | labels, 116 | grads, 117 | label_lens, 118 | input_lens, 119 | costs) 120 | elif self.be.backend_name == "cpu" or self.be.backend_name == "mkl": 121 | self.be_ctc_cpu( 122 | inputs, 123 | labels, 124 | grads, 125 | label_lens, 126 | input_lens, 127 | costs, 128 | nout) 129 | else: 130 | raise NotImplementedError() 131 | 132 | def be_ctc_gpu( 133 | self, 134 | nout, 135 | inputs, 136 | labels, 137 | grads, 138 | label_lens, 139 | input_lens, 140 | costs): 141 | """ 142 | Calling Warp-CTC 143 | """ 144 | 145 | # Set up cuda stream 146 | if self.be.stream is None: 147 | stream_buf = ct.cast(self.be.stream, ct.c_void_p) 148 | else: 149 | stream_buf = ct.cast( 150 | id(self.be.stream), ct.POINTER(ct.c_void_p)).contents 151 | 152 | # map first function to get workspace size 153 | self.ctclib.get_workspace_size_gpu.restype = int 154 | self.ctclib.get_workspace_size_gpu.argtypes = [npct.ndpointer(dtype=np.int32, ndim=1), 155 | npct.ndpointer(dtype=np.int32, ndim=1), 156 | ct.c_int, 157 | ct.c_int, 158 | ct.c_void_p] 159 | scratch_size = self.ctclib.get_workspace_size_gpu(np.array(label_lens.get().ravel(), 160 | dtype=np.int32), 161 | np.array(input_lens.get().ravel(), 162 | dtype=np.int32), 163 | nout, self.be.bsz, 164 | stream_buf) 165 | self.be.set_scratch_size(scratch_size) 166 | workspace = self.be.scratch_buffer(scratch_size) 167 | 168 | # map ctc function 169 | self.ctclib.compute_ctc_loss_gpu.restype = int 170 | self.ctclib.compute_ctc_loss_gpu.argtypes = [ct.POINTER(ct.c_float), 171 | ct.POINTER(ct.c_float), 172 | npct.ndpointer(dtype=np.int32, ndim=1), 173 | npct.ndpointer(dtype=np.int32, ndim=1), 174 | npct.ndpointer(dtype=np.int32, ndim=1), 175 | ct.c_int, 176 | ct.c_int, 177 | ct.POINTER(ct.c_float), 178 | ct.POINTER(ct.c_char), 179 | ct.c_void_p] 180 | 181 | inputs_buf = ct.cast(int(inputs.gpudata), 
ct.POINTER(ct.c_float)) 182 | grads_buf = ct.cast(int(grads.gpudata), ct.POINTER(ct.c_float)) 183 | costs_buf = ct.cast(int(costs.gpudata), ct.POINTER(ct.c_float)) 184 | workspace_buf = ct.cast(workspace, ct.POINTER(ct.c_char)) 185 | 186 | status = self.ctclib.compute_ctc_loss_gpu(inputs_buf, 187 | grads_buf, 188 | np.array(labels.get().ravel(), 189 | dtype=np.int32), 190 | np.array(label_lens.get().ravel(), 191 | dtype=np.int32), 192 | np.array(input_lens.get().ravel(), 193 | dtype=np.int32), 194 | nout, 195 | self.be.bsz, 196 | costs_buf, 197 | workspace_buf, 198 | stream_buf) 199 | 200 | assert status is 0, "Warp-CTC run failed" 201 | return 202 | 203 | def be_ctc_cpu( 204 | self, 205 | inputs, 206 | labels, 207 | grads, 208 | label_lens, 209 | input_lens, 210 | costs, 211 | nout): 212 | """ 213 | Calling Warp-CTC 214 | """ 215 | 216 | # Workspace is allocated in ctc_entrypoint.cpp since the CPU backend in neon doesn't have 217 | # scratch space support 218 | # Map compute_ctc_loss 219 | self.ctclib.compute_ctc_loss_cpu.restype = int 220 | self.ctclib.compute_ctc_loss_cpu.argtypes = [ 221 | npct.ndpointer(dtype=np.float32, ndim=3), 222 | npct.ndpointer(dtype=np.float32, ndim=3), 223 | npct.ndpointer(dtype=np.int32, ndim=1), 224 | npct.ndpointer(dtype=np.int32, ndim=1), 225 | npct.ndpointer(dtype=np.int32, ndim=1), 226 | ctypes.c_int, 227 | ctypes.c_int, 228 | npct.ndpointer(dtype=np.float32, ndim=1), 229 | ctypes.c_int] 230 | 231 | num_threads = 8 232 | _inputs = np.array(inputs.get(), dtype=np.float32) 233 | _grads = np.array(grads.get(), dtype=np.float32) 234 | _labels = np.array(labels.get().ravel(), dtype=np.int32) 235 | _label_lens = np.array(label_lens.get().ravel(), dtype=np.int32) 236 | _input_lens = np.array(input_lens.get().ravel(), dtype=np.int32) 237 | _costs = np.array(costs.get().ravel(), dtype=np.float32) 238 | status = self.ctclib.compute_ctc_loss_cpu(_inputs, 239 | _grads, 240 | _labels, 241 | _label_lens, 242 | _input_lens, 243 | nout, 244 | 
self.be.bsz, 245 | _costs, 246 | num_threads) 247 | 248 | assert status is 0, "Warp-CTC run failed" 249 | costs[:] = _costs 250 | grads[:] = _grads 251 | return 252 | 253 | def bprop(self, y, t): 254 | # warp-ctc returns grads with shape (T, bsz, nout), 255 | # so reshape warp-ctc grads to match neon grads 256 | self.be.copy_transpose(self.grad_warp, self.grad_view, (2, 0, 1)) 257 | 258 | return self.grad 259 | -------------------------------------------------------------------------------- /speech/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/speech/data/__init__.py -------------------------------------------------------------------------------- /speech/data/dataloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2017 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | import os 17 | import numpy as np 18 | from neon.data.aeon_shim import AeonDataLoader 19 | from neon.data.dataloader_transformers import TypeCast, Retuple 20 | 21 | 22 | def common_config(manifest_file, batch_size, alphabet, nbands, max_tscrpt_len, max_utt_len): 23 | 24 | audio_config = {"type": "audio", 25 | "sample_freq_hz": 16000, 26 | "max_duration": "{} seconds".format(max_utt_len), 27 | "frame_length": "25 milliseconds", 28 | "frame_stride": "10 milliseconds", 29 | "feature_type": "mfsc", 30 | "emit_length": True, 31 | "num_filters": nbands} 32 | 33 | transcription_config = {"type": "char_map", 34 | "alphabet": alphabet, 35 | "emit_length": True, 36 | "max_length": max_tscrpt_len} 37 | 38 | return {'manifest_filename': manifest_file, 39 | 'manifest_root': os.path.dirname(manifest_file), 40 | 'batch_size': batch_size, 41 | 'block_size': batch_size, 42 | 'etl': [audio_config, transcription_config]} 43 | 44 | 45 | def wrap_dataloader(dl): 46 | """ Data is loaded from Aeon as a 4-tuple. We need to cast the audio 47 | (index 0) from int8 to float32 and repack the data into (audio, 3-tuple). 48 | """ 49 | 50 | dl = TypeCast(dl, index=0, dtype=np.float32) 51 | dl = Retuple(dl, data=(0,), target=(2, 3, 1)) 52 | return dl 53 | 54 | 55 | def make_loader(manifest_file, alphabet, nbands, max_tscrpt_len, max_utt_len, backend_obj): 56 | aeon_config = common_config(manifest_file, backend_obj.bsz, alphabet, nbands, max_tscrpt_len, 57 | max_utt_len) 58 | return wrap_dataloader(AeonDataLoader(aeon_config)) 59 | -------------------------------------------------------------------------------- /speech/data/ingest_librispeech.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 
4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ---------------------------------------------------------------------------- 16 | 17 | import glob 18 | import logging 19 | import os 20 | 21 | logging.basicConfig() 22 | logger = logging.getLogger(__name__) 23 | logger.setLevel(logging.INFO) 24 | 25 | 26 | def write_manifest(output_file, *filenames): 27 | """ Writes out a manifest file from a series of lists of filenames 28 | """ 29 | 30 | with open(output_file, "w") as fid: 31 | fid.write("@{}\n".format("\t".join(["FILE"] * len(filenames)))) 32 | for line in zip(*filenames): 33 | fid.write("\t".join(line) + "\n") 34 | 35 | return True 36 | 37 | 38 | def main(input_directory, transcript_directory, manifest_file): 39 | """ Finds all .flac files recursively in input_directory, then extracts the 40 | transcript from the nearby .trans.txt file and stores it in 41 | transcript_directory. Writes a manifest file referring to each .flac file 42 | and its paired transcript. 43 | 44 | Arguments: 45 | input_directory (string): Path to librispeech directory 46 | transcript_directory (string): Path to directory in which to write 47 | individual transcript files. 48 | manifest_file (string): Path to manifest file to output. 
49 | """ 50 | 51 | def librispeech_flac_filename(filestr): 52 | parts = filestr.split("-") 53 | 54 | return os.path.join(input_directory, parts[0], parts[1], 55 | "{}.flac".format(filestr)) 56 | 57 | if not os.path.isdir(input_directory): 58 | raise IOError("Data directory does not exist! {}".format(input_directory)) 59 | 60 | if not os.path.exists(transcript_directory): 61 | os.makedirs(transcript_directory) 62 | 63 | transcript_files = glob.glob(os.path.join(input_directory, '*/*/*.txt')) 64 | if len(transcript_files) == 0: 65 | logger.error("No .txt files were found in {}".format(input_directory)) 66 | return 67 | 68 | logger.info("Beginning audio conversions") 69 | audio_files = list() 70 | txt_files = list() 71 | for ii, tfile in enumerate(transcript_files): 72 | # transcript file specifies transcript and flac filename for all librispeech files 73 | logger.info("Converting audio corresponding to transcript " 74 | "{} of {}".format(ii, len(transcript_files))) 75 | with open(tfile, "r") as fid: 76 | lines = fid.readlines() 77 | 78 | for line in lines: 79 | filestr, transcript = line.split(" ", 1) 80 | try: 81 | flac_file = librispeech_flac_filename(filestr) 82 | except IndexError: # filestr is not the format we are expecting 83 | print("filestr of unexpected formatting: {}".format(filestr)) 84 | print("error in {}".format(tfile)) 85 | continue 86 | txt_file = os.path.join(transcript_directory, 87 | "{}.txt".format(filestr)) 88 | 89 | # Write out short transcript file 90 | with open(txt_file, "w") as fid: 91 | fid.write(transcript.strip()) 92 | 93 | # Add to output lists to be written to manifest 94 | audio_files.append(flac_file) 95 | txt_files.append(txt_file) 96 | 97 | logger.info("Writing manifest file to {}".format(manifest_file)) 98 | return write_manifest(manifest_file, audio_files, txt_files) 99 | 100 | 101 | def convert_aeon_manifests(old_manifest, new_manifest): 102 | """Convert an Aeon < 1.0 manifest to an Aeon >= 1.0 manifest""" 103 | try: 104 | with 
open(old_manifest, "r") as old: 105 | with open(new_manifest, "w") as new: 106 | lines = old.readlines() 107 | nfields = len(lines[0].split(",")) 108 | new.write("@{}\n".format("\t".join(["FILE"] * nfields))) 109 | for line in lines: 110 | new_line = "\t".join(line.strip().split(",")) 111 | new.write("{}\n".format(new_line)) 112 | except: 113 | return False 114 | return True 115 | 116 | 117 | if __name__ == "__main__": 118 | import argparse 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument("input_directory", 121 | help="Directory containing librispeech flac files") 122 | parser.add_argument("transcript_directory", 123 | help="Directory to write transcript .txt files") 124 | parser.add_argument("manifest_file", 125 | help="Output file that specifies the filename for each" 126 | " output audio and transcript") 127 | 128 | args = parser.parse_args() 129 | main(args.input_directory, 130 | args.transcript_directory, 131 | args.manifest_file) 132 | -------------------------------------------------------------------------------- /speech/decoder.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | 17 | import numpy as np 18 | import Levenshtein as Lev 19 | 20 | 21 | class Decoder(object): 22 | """ 23 | Basic decoder class from which all other decoders inherit. Implements several 24 | helper functions. Subclasses should implement the decode() method. 25 | 26 | Arguments: 27 | alphabet (string): mapping from integers to characters. 28 | blank_index (int, optional): index for the blank '_' character. Defaults to 0. 29 | space_index (int, optional): index for the space ' ' character. Defaults to 28. 30 | """ 31 | 32 | def __init__(self, alphabet, blank_index=0, space_index=1): 33 | # e.g. alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ#" 34 | self.alphabet = alphabet 35 | self.int_to_char = dict([(i, c) for (i, c) in enumerate(alphabet)]) 36 | self.blank_index = blank_index 37 | self.space_index = space_index 38 | 39 | def convert_to_string(self, sequence): 40 | "Given a numeric sequence, returns the corresponding string" 41 | 42 | return ''.join([self.int_to_char[i] for i in sequence]) 43 | 44 | def process_string(self, sequence, remove_repetitions=False): 45 | """ 46 | Given a string, removes blanks and replace space character with space. 47 | Option to remove repetitions (e.g. 'abbca' -> 'abca'). 48 | 49 | Arguments: 50 | sequence (array of int): 1-d array of integers 51 | remove_repetitions (boolean, optional): If true, repeating characters 52 | are removed. Defaults to False. 53 | """ 54 | string = '' 55 | 56 | for i, char in enumerate(sequence): 57 | if(char != self.int_to_char[self.blank_index]): 58 | # if this char is a repetition and remove_repetitions=true, 59 | # skip. 
60 | if(remove_repetitions and i != 0 and char == sequence[i - 1]): 61 | pass 62 | elif(char == self.alphabet[self.space_index]): 63 | string = string + ' ' 64 | else: 65 | string = string + char 66 | 67 | return string 68 | 69 | def log_sum(self, list_of_probs): 70 | """ 71 | Computes the sum of log-probabilities. 72 | 73 | Arguments: 74 | list_of_probs (iterable): list of log-probabilities 75 | """ 76 | return np.log(np.sum([np.exp(p) for p in list_of_probs])) 77 | 78 | def wer(self, s1, s2): 79 | """ 80 | Computes the Word Error Rate, defined as the edit distance between the 81 | two provided sentences after tokenizing to words. 82 | Arguments: 83 | s1 (string): space-separated sentence 84 | s2 (string): space-separated sentence 85 | """ 86 | 87 | # build mapping of words to integers 88 | b = set(s1.split() + s2.split()) 89 | word2char = dict(zip(b, range(len(b)))) 90 | 91 | # map the words to a char array (Levenshtein packages only accepts 92 | # strings) 93 | w1 = [chr(word2char[w]) for w in s1.split()] 94 | w2 = [chr(word2char[w]) for w in s2.split()] 95 | 96 | return Lev.distance(''.join(w1), ''.join(w2)) 97 | 98 | def cer(self, s1, s2): 99 | """ 100 | Computes the Character Error Rate, defined as the edit distance. 101 | 102 | Arguments: 103 | s1 (string): space-separated sentence 104 | s2 (string): space-separated sentence 105 | """ 106 | return Lev.distance(s1, s2) 107 | 108 | def decode(self, probs): 109 | """ 110 | Given a matrix of character probabilities, returns the decoder's 111 | best guess of the transcription 112 | 113 | Arguments: 114 | probs (ndarray): Matrix of character probabilities, where probs[c,t] 115 | is the probability of character c at time t 116 | Returns: 117 | string: sequence of the model's best guess for the transcription 118 | 119 | """ 120 | raise NotImplementedError 121 | 122 | 123 | class ArgMaxDecoder(Decoder): 124 | 125 | def decode(self, probs): 126 | """ 127 | Returns the argmax decoding given the probability matrix. 
Removes 128 | repeated elements in the sequence, as well as blanks. 129 | """ 130 | string = self.convert_to_string(np.argmax(probs, axis=0)) 131 | return self.process_string(string, remove_repetitions=True) -------------------------------------------------------------------------------- /speech/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | 17 | import os 18 | import numpy as np 19 | import pickle as pkl 20 | 21 | from neon.backends import gen_backend 22 | from neon.util.argparser import NeonArgparser, extract_valid_args 23 | from neon.models import Model 24 | 25 | from decoder import ArgMaxDecoder 26 | from utils import get_wer 27 | 28 | from data.dataloader import make_loader 29 | 30 | # Parse the command line arguments 31 | arg_defaults = {'batch_size': 32} 32 | parser = NeonArgparser(__doc__, default_overrides=arg_defaults) 33 | parser.add_argument('--use_wer', action="store_true", 34 | help='compute wer instead of cer.') 35 | parser.add_argument('--inference_file', default=None, 36 | help='saves results in inference_file.') 37 | parser.add_argument('--print_examples', action="store_true", 38 | help='print an example transcript for each batch') 39 | args = parser.parse_args() 40 | 41 | if args.model_file is None: 42 | raise ArgumentError("A model file is required for evaluation") 43 | 44 | if "val" not in args.manifest: 45 | raise ArgumentError("Please provide an argument of the form:\n" + \ 46 | "--manifest val:/path/to/validation/manifest.csv") 47 | 48 | # Setup parameters for argmax decoder 49 | alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 50 | nout = len(alphabet) 51 | argmax_decoder = ArgMaxDecoder(alphabet, space_index=alphabet.index(" ")) 52 | 53 | # Initialize our backend 54 | be = gen_backend(**extract_valid_args(args, gen_backend)) 55 | 56 | # Setup dataloader 57 | eval_manifest = args.manifest['val'] 58 | if not os.path.exists(eval_manifest): 59 | raise IOError("Manifest file {} not found".format(eval_manifest)) 60 | 61 | # Setup required dataloader parameters 62 | nbands = 13 63 | max_utt_len = 30 64 | max_tscrpt_len = 1300 65 | eval_set = make_loader(eval_manifest, alphabet, nbands, max_tscrpt_len, 66 | max_utt_len, backend_obj=be) 67 | 68 | # Load the model 69 | model = Model(args.model_file) 70 | 
71 | # Process data and compute stats 72 | wer, sample_size, results = get_wer(model, be, eval_set, argmax_decoder, nout, 73 | use_wer=args.use_wer, print_examples=args.print_examples) 74 | 75 | print("\n" + "-" * 80) 76 | if args.use_wer: 77 | print("wer = {}".format(wer)) 78 | else: 79 | print("cer = {}".format(wer)) 80 | print("-" * 80 + "\n") 81 | 82 | if args.inference_file: 83 | # Save results in args.inference_file 84 | with open(args.inference_file, 'wb') as f: 85 | pkl.dump((results, wer), f) 86 | print("Saved inference results to {}".format(args.inference_file)) 87 | -------------------------------------------------------------------------------- /speech/sample_proposals_callback.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | import numpy as np 17 | import sys 18 | from neon.callbacks.callbacks import Callback 19 | 20 | 21 | class WordErrorRateCallback(Callback): 22 | 23 | def __init__(self, eval_set, decoder, max_s, noise_label=None, epoch_freq=1): 24 | super(WordErrorRateCallback, self).__init__(epoch_freq=epoch_freq) 25 | 26 | self.eval_set = eval_set 27 | self.nout = len(decoder.alphabet) 28 | self.decoder = decoder 29 | if noise_label is None: 30 | self.noise_label = '' 31 | else: 32 | self.noise_label = noise_label 33 | 34 | def decrypt(self, decoder, message, noise_label): 35 | msg = decoder.convert_to_string(message) 36 | return decoder.process_string(msg, remove_repetitions=False 37 | ).replace(noise_label, '') 38 | 39 | def softmax(self, x): 40 | return (np.reciprocal(np.sum( 41 | np.exp(x - np.max(x, axis=0)), axis=0)) * 42 | np.exp(x - np.max(x, axis=0))) 43 | 44 | def dev_to_host(self, dev_tensor): 45 | if self.be.distribute_data(dev_tensor, "Disabled"): 46 | revert = True 47 | else: 48 | revert = False 49 | host_tensor = dev_tensor.get() 50 | if revert: 51 | self.be.revert_tensor(dev_tensor) 52 | return host_tensor 53 | 54 | def get_outputs(self, model, inputs): 55 | outputs = model.fprop(inputs, inference=True) 56 | return self.softmax(self.dev_to_host(outputs)).reshape( 57 | (self.nout, -1, self.be.bsz)).transpose((2, 0, 1)) 58 | 59 | def get_wer(self, model, dataset, noise_symbol=None): 60 | if noise_symbol is None: 61 | noise_symbol = '' 62 | cer = 0 63 | batch_count = 1e-10 64 | for x, y in dataset: 65 | batch_count += 1 66 | probs = self.get_outputs(model, x) 67 | strided_tmax = probs.shape[-1] 68 | flat_labels = self.dev_to_host(y[0])[0,:] 69 | tscrpt_lens = self.dev_to_host(y[1])[0, :] 70 | utt_lens = strided_tmax * self.dev_to_host(y[2])[0, :] / 100 71 | disp_indx = np.random.randint(self.be.bsz) 72 | for mu in range(self.be.bsz): 73 | prediction = 
self.decoder.decode(probs[mu, :, :int(utt_lens[mu])]) 74 | start = int(np.sum(tscrpt_lens[:mu])) 75 | target = flat_labels[start:start + tscrpt_lens[mu]].tolist() 76 | target = self.decrypt(self.decoder, target, noise_symbol) 77 | cer += self.decoder.cer(prediction, target) / (1.0 * len(target)) 78 | 79 | if mu == disp_indx: 80 | disp_proposal = prediction 81 | disp_target = target 82 | return cer / (batch_count * self.be.bsz), disp_proposal, disp_target 83 | 84 | def on_epoch_end(self, callback_data, model, epoch): 85 | cer, disp_proposal, disp_target = self.get_wer(model, self.eval_set) 86 | print(u"Proposal: {}".format(disp_proposal)) 87 | print(u"Target: {}".format(disp_target)) 88 | print("CER (validation) = {}".format(cer)) 89 | -------------------------------------------------------------------------------- /speech/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ---------------------------------------------------------------------------- 16 | """ 17 | Train ds2-style speech model on Librispeech 18 | """ 19 | 20 | import os 21 | import numpy as np 22 | 23 | from neon.backends import gen_backend 24 | from neon.callbacks.callbacks import Callbacks 25 | from neon.initializers import GlorotUniform, Constant, Gaussian 26 | from neon.layers import Conv, GeneralizedCost, Affine, DeepBiRNN 27 | from neon.models import Model 28 | from neon.transforms import Rectlin, Identity, Rectlinclip 29 | from neon.optimizers import GradientDescentMomentum 30 | from neon.util.argparser import NeonArgparser, extract_valid_args 31 | 32 | from ctc import CTC 33 | from decoder import ArgMaxDecoder 34 | from sample_proposals_callback import WordErrorRateCallback 35 | from data.dataloader import make_loader 36 | 37 | # Parse the command line arguments 38 | arg_defaults = {'batch_size': 32} 39 | 40 | parser = NeonArgparser(__doc__, default_overrides=arg_defaults) 41 | parser.add_argument('--nfilters', type=int, 42 | help='no. 
of conv filters', default=1152) 43 | parser.add_argument('--filter_width', type=int, 44 | help='width of conv filter', default=11) 45 | parser.add_argument('--str_w', type=int, help='stride in time', default=3) 46 | parser.add_argument('--depth', type=int, help='rnn depth', default=9) 47 | parser.add_argument('--hidden_size', type=int, 48 | help='affine/rnn hidden units', default=1152) 49 | parser.add_argument('--lr', type=float, 50 | help='learning rate', default=2e-5) 51 | parser.add_argument('--momentum', type=float, 52 | help='momentum', default=0.99) 53 | args = parser.parse_args() 54 | 55 | # Setup model hyperparameters 56 | # Convolution layer hyperparameters 57 | nfilters = args.nfilters # Number of convolutional filters 58 | filter_width = args.filter_width # Width of convolutional filters 59 | str_w = args.str_w # Convolutional filter stride 60 | 61 | # RNN hyperparameters 62 | depth = args.depth # Number of BiRNN layers 63 | hidden_size = args.hidden_size # Number of units in each BiRNN layer 64 | 65 | # Optimization hyperparameters 66 | learning_rate = args.lr 67 | momentum = args.momentum 68 | gradient_clip_norm = 400 69 | 70 | # Setup parameters for argmax decoder 71 | alphabet = "_'ABCDEFGHIJKLMNOPQRSTUVWXYZ " 72 | nout = len(alphabet) 73 | argmax_decoder = ArgMaxDecoder(alphabet, space_index=alphabet.index(" ")) 74 | 75 | # Initialize our backend 76 | be = gen_backend(**extract_valid_args(args, gen_backend)) 77 | 78 | # Setup dataloader 79 | nbands = 13 80 | max_tscrpt_len = 1300 81 | max_utt_len = 30 82 | 83 | train_manifest = args.manifest['train'] 84 | if not os.path.exists(train_manifest): 85 | raise RuntimeError( 86 | "training manifest file {} not found".format(train_manifest)) 87 | dev_manifest = args.manifest['val'] 88 | if not os.path.exists(dev_manifest): 89 | raise RuntimeError( 90 | "validation manifest file {} not found".format(dev_manifest)) 91 | 92 | train = make_loader(train_manifest, alphabet, nbands, max_tscrpt_len, max_utt_len, 
backend_obj=be) 93 | dev = make_loader(dev_manifest, alphabet, nbands, max_tscrpt_len, max_utt_len, backend_obj=be) 94 | 95 | # Setup the layers of the DNN 96 | # Softmax is performed in warp-ctc, so we use an Identity activation in the 97 | # final layer. 98 | gauss = Gaussian(scale=0.01) 99 | glorot = GlorotUniform() 100 | layers = [ 101 | Conv( 102 | (nbands, 103 | filter_width, 104 | nfilters), 105 | init=gauss, 106 | bias=Constant(0), 107 | activation=Rectlin(), 108 | padding=dict( 109 | pad_h=0, 110 | pad_w=5), 111 | strides=dict( 112 | str_h=1, 113 | str_w=str_w)), 114 | DeepBiRNN( 115 | hidden_size, 116 | init=glorot, 117 | activation=Rectlinclip(), 118 | batch_norm=True, 119 | reset_cells=True, 120 | depth=depth), 121 | Affine( 122 | hidden_size, 123 | init=glorot, 124 | activation=Rectlinclip()), 125 | Affine( 126 | nout=nout, 127 | init=glorot, 128 | activation=Identity())] 129 | 130 | model = Model(layers=layers) 131 | 132 | opt = GradientDescentMomentum(learning_rate, momentum, 133 | gradient_clip_norm=gradient_clip_norm, 134 | stochastic_round=False, 135 | nesterov=True) 136 | callbacks = Callbacks(model, eval_set=dev, **args.callback_args) 137 | 138 | # Print validation set word error rate at the end of every epoch 139 | pcb = WordErrorRateCallback(dev, argmax_decoder, max_tscrpt_len, epoch_freq=1) 140 | callbacks.add_callback(pcb) 141 | 142 | cost = GeneralizedCost(costfunc=CTC(max_tscrpt_len, nout=nout)) 143 | 144 | # Fit the model 145 | model.fit(train, optimizer=opt, num_epochs=args.epochs, 146 | cost=cost, callbacks=callbacks) 147 | -------------------------------------------------------------------------------- /speech/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2015-2016 Nervana Systems Inc. 
4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ---------------------------------------------------------------------------- 16 | 17 | import sys 18 | import numpy as np 19 | from tqdm import tqdm 20 | 21 | 22 | def softmax(x): 23 | return (np.reciprocal(np.sum( 24 | np.exp(x - np.max(x, axis=0)), axis=0)) * 25 | np.exp(x - np.max(x, axis=0))) 26 | 27 | def get_outputs(model, be, inputs, nout): 28 | outputs = model.fprop(inputs, inference=False) 29 | return softmax(outputs.get()).reshape( 30 | (nout, -1, be.bsz)).transpose((2, 0, 1)) 31 | 32 | def eval_model(model, dataset, nout, bsz): 33 | return [((get_outputs(model, x, nout, bsz), 34 | y[0]), y[2]) for (x, y) in dataset] 35 | 36 | def decrypt(decoder, message): 37 | msg = decoder.convert_to_string(message) 38 | return decoder.process_string(msg, remove_repetitions=False) 39 | 40 | def get_wer(model, be, dataset, decoder, nout, use_wer=False, print_examples=False): 41 | wer = 0 42 | batchcount = 0 43 | predictions = list() 44 | targets = list() 45 | nbatches = dataset.nbatches 46 | 47 | if not model.initialized: 48 | model.initialize(dataset) 49 | 50 | progress_bar = tqdm(dataset, total=nbatches, unit="batches") 51 | for x, y in progress_bar: 52 | probs = get_outputs(model, be, x, nout) 53 | strided_tmax = probs.shape[-1] 54 | flat_labels = y[0].get().ravel() 55 | tscrpt_lens = y[1].get().ravel() 56 | utt_lens = strided_tmax * y[2].get().ravel() / 100 57 | for mu in 
range(be.bsz): 58 | prediction = decoder.decode(probs[mu, :, :int(utt_lens[mu])]) 59 | start = int(np.sum(tscrpt_lens[:mu])) 60 | target = flat_labels[start:start + tscrpt_lens[mu]].tolist() 61 | target = decrypt(decoder, target) 62 | predictions.append(prediction) 63 | targets.append(target) 64 | if not use_wer: 65 | wer += decoder.cer(prediction, target) / float(len(target)) 66 | else: 67 | wer += decoder.wer(prediction, target) / \ 68 | float(len(target.split())) 69 | 70 | if use_wer: 71 | progress_bar.set_description("WER: {}".format(wer / len(predictions))) 72 | else: 73 | progress_bar.set_description("CER: {}".format(wer / len(predictions))) 74 | if print_examples is True: 75 | progress_bar.write("Transcribed: {}".format(predictions[-1])) 76 | progress_bar.write("Target: {}".format(targets[-1])) 77 | 78 | results = zip(predictions, targets) 79 | nsamples = len(predictions) 80 | return wer / nsamples, nsamples , results 81 | -------------------------------------------------------------------------------- /src/transforms/.gitignore: -------------------------------------------------------------------------------- 1 | warp-ctc/build -------------------------------------------------------------------------------- /src/transforms/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: default clean 2 | 3 | default: libwarpctc.so 4 | 5 | OS := $(shell uname) 6 | 7 | clean: 8 | @rm -rf warp-ctc/build 9 | @rm -rf libwarpctc.so 10 | 11 | libwarpctc.so: warp-ctc/build/libwarpctc.so 12 | @ln -sf warp-ctc/build/libwarpctc.so libwarpctc.so 13 | 14 | 15 | warp-ctc/build/libwarpctc.so: 16 | @rm -rf warp-ctc/build 17 | @mkdir warp-ctc/build 18 | @cd warp-ctc/build && cmake .. 
&& make 19 | ifeq ($(OS),Darwin) 20 | @cd warp-ctc/build && ln -sf libwarpctc.dylib libwarpctc.so 21 | endif 22 | 23 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | IF (APPLE) 2 | cmake_minimum_required(VERSION 3.4) 3 | ELSE() 4 | cmake_minimum_required(VERSION 2.8) 5 | ENDIF() 6 | 7 | project(ctc_release) 8 | 9 | IF (NOT APPLE) 10 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -O2 -g") 11 | ENDIF() 12 | 13 | IF (APPLE) 14 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2 -g") 15 | add_definitions(-DAPPLE) 16 | ENDIF() 17 | 18 | include_directories(include) 19 | 20 | FIND_PACKAGE(CUDA 6.5) 21 | MESSAGE(STATUS "cuda found ${CUDA_FOUND}") 22 | 23 | # need to be at least 30 or __shfl_down in reduce wont compile 24 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_30,code=sm_30 -O2") 25 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_35,code=sm_35") 26 | 27 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_50,code=sm_50") 28 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_52,code=sm_52") 29 | 30 | # https://github.com/baidu-research/warp-ctc/commit/ecc7ed2f65becf8946ebff8c59b7e1eeeef44334 31 | IF (CUDA_VERSION GREATER 7.6) 32 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_60,code=sm_60") 33 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_61,code=sm_61") 34 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -gencode arch=compute_62,code=sm_62") 35 | ENDIF() 36 | 37 | if (NOT APPLE) 38 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std=c++11") 39 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -Xcompiler -fopenmp") 40 | ENDIF() 41 | 42 | IF (APPLE) 43 | EXEC_PROGRAM(uname ARGS -v OUTPUT_VARIABLE DARWIN_VERSION) 44 | STRING(REGEX MATCH "[0-9]+" DARWIN_VERSION ${DARWIN_VERSION}) 45 | MESSAGE(STATUS 
"DARWIN_VERSION=${DARWIN_VERSION}") 46 | 47 | #for el capitain have to use rpath 48 | 49 | IF (DARWIN_VERSION LESS 15) 50 | set(CMAKE_SKIP_RPATH TRUE) 51 | ENDIF () 52 | 53 | ELSE() 54 | #always skip for linux 55 | set(CMAKE_SKIP_RPATH TRUE) 56 | ENDIF() 57 | 58 | 59 | IF (CUDA_FOUND) 60 | 61 | MESSAGE(STATUS "Building shared library with GPU support") 62 | 63 | CUDA_ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cu src/reduce.cu) 64 | TARGET_LINK_LIBRARIES(warpctc ${CUDA_curand_LIBRARY}) 65 | 66 | add_executable(test_cpu tests/test_cpu.cpp ) 67 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 68 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} -g --std=c++11") 69 | 70 | cuda_add_executable(test_gpu tests/test_gpu.cu) 71 | TARGET_LINK_LIBRARIES(test_gpu warpctc ${CUDA_curand_LIBRARY}) 72 | 73 | INSTALL(TARGETS warpctc 74 | RUNTIME DESTINATION bin 75 | LIBRARY DESTINATION lib 76 | ARCHIVE DESTINATION lib/static) 77 | 78 | ELSE() 79 | MESSAGE(STATUS "Building shared library with no GPU support") 80 | 81 | if (NOT APPLE) 82 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O2") 83 | ENDIF() 84 | 85 | ADD_LIBRARY(warpctc SHARED src/ctc_entrypoint.cpp) 86 | 87 | add_executable(test_cpu tests/test_cpu.cpp ) 88 | TARGET_LINK_LIBRARIES(test_cpu warpctc) 89 | SET_TARGET_PROPERTIES(test_cpu PROPERTIES COMPILE_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11") 90 | 91 | INSTALL(TARGETS warpctc 92 | RUNTIME DESTINATION bin 93 | LIBRARY DESTINATION lib 94 | ARCHIVE DESTINATION lib/static) 95 | ENDIF() 96 | 97 | 98 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2016 Baidu USA LLC. All rights reserved. 2 | 3 | Apache License 4 | Version 2.0, January 2004 5 | http://www.apache.org/licenses/ 6 | 7 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 8 | 9 | 1. Definitions. 
10 | 11 | "License" shall mean the terms and conditions for use, reproduction, 12 | and distribution as defined by Sections 1 through 9 of this document. 13 | 14 | "Licensor" shall mean the copyright owner or entity authorized by 15 | the copyright owner that is granting the License. 16 | 17 | "Legal Entity" shall mean the union of the acting entity and all 18 | other entities that control, are controlled by, or are under common 19 | control with that entity. For the purposes of this definition, 20 | "control" means (i) the power, direct or indirect, to cause the 21 | direction or management of such entity, whether by contract or 22 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 23 | outstanding shares, or (iii) beneficial ownership of such entity. 24 | 25 | "You" (or "Your") shall mean an individual or Legal Entity 26 | exercising permissions granted by this License. 27 | 28 | "Source" form shall mean the preferred form for making modifications, 29 | including but not limited to software source code, documentation 30 | source, and configuration files. 31 | 32 | "Object" form shall mean any form resulting from mechanical 33 | transformation or translation of a Source form, including but 34 | not limited to compiled object code, generated documentation, 35 | and conversions to other media types. 36 | 37 | "Work" shall mean the work of authorship, whether in Source or 38 | Object form, made available under the License, as indicated by a 39 | copyright notice that is included in or attached to the work 40 | (an example is provided in the Appendix below). 41 | 42 | "Derivative Works" shall mean any work, whether in Source or Object 43 | form, that is based on (or derived from) the Work and for which the 44 | editorial revisions, annotations, elaborations, or other modifications 45 | represent, as a whole, an original work of authorship. 
For the purposes 46 | of this License, Derivative Works shall not include works that remain 47 | separable from, or merely link (or bind by name) to the interfaces of, 48 | the Work and Derivative Works thereof. 49 | 50 | "Contribution" shall mean any work of authorship, including 51 | the original version of the Work and any modifications or additions 52 | to that Work or Derivative Works thereof, that is intentionally 53 | submitted to Licensor for inclusion in the Work by the copyright owner 54 | or by an individual or Legal Entity authorized to submit on behalf of 55 | the copyright owner. For the purposes of this definition, "submitted" 56 | means any form of electronic, verbal, or written communication sent 57 | to the Licensor or its representatives, including but not limited to 58 | communication on electronic mailing lists, source code control systems, 59 | and issue tracking systems that are managed by, or on behalf of, the 60 | Licensor for the purpose of discussing and improving the Work, but 61 | excluding communication that is conspicuously marked or otherwise 62 | designated in writing by the copyright owner as "Not a Contribution." 63 | 64 | "Contributor" shall mean Licensor and any individual or Legal Entity 65 | on behalf of whom a Contribution has been received by Licensor and 66 | subsequently incorporated within the Work. 67 | 68 | 2. Grant of Copyright License. Subject to the terms and conditions of 69 | this License, each Contributor hereby grants to You a perpetual, 70 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 71 | copyright license to reproduce, prepare Derivative Works of, 72 | publicly display, publicly perform, sublicense, and distribute the 73 | Work and such Derivative Works in Source or Object form. 74 | 75 | 3. Grant of Patent License. 
Subject to the terms and conditions of 76 | this License, each Contributor hereby grants to You a perpetual, 77 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 78 | (except as stated in this section) patent license to make, have made, 79 | use, offer to sell, sell, import, and otherwise transfer the Work, 80 | where such license applies only to those patent claims licensable 81 | by such Contributor that are necessarily infringed by their 82 | Contribution(s) alone or by combination of their Contribution(s) 83 | with the Work to which such Contribution(s) was submitted. If You 84 | institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work 86 | or a Contribution incorporated within the Work constitutes direct 87 | or contributory patent infringement, then any patent licenses 88 | granted to You under this License for that Work shall terminate 89 | as of the date such litigation is filed. 90 | 91 | 4. Redistribution. 
You may reproduce and distribute copies of the 92 | Work or Derivative Works thereof in any medium, with or without 93 | modifications, and in Source or Object form, provided that You 94 | meet the following conditions: 95 | 96 | (a) You must give any other recipients of the Work or 97 | Derivative Works a copy of this License; and 98 | 99 | (b) You must cause any modified files to carry prominent notices 100 | stating that You changed the files; and 101 | 102 | (c) You must retain, in the Source form of any Derivative Works 103 | that You distribute, all copyright, patent, trademark, and 104 | attribution notices from the Source form of the Work, 105 | excluding those notices that do not pertain to any part of 106 | the Derivative Works; and 107 | 108 | (d) If the Work includes a "NOTICE" text file as part of its 109 | distribution, then any Derivative Works that You distribute must 110 | include a readable copy of the attribution notices contained 111 | within such NOTICE file, excluding those notices that do not 112 | pertain to any part of the Derivative Works, in at least one 113 | of the following places: within a NOTICE text file distributed 114 | as part of the Derivative Works; within the Source form or 115 | documentation, if provided along with the Derivative Works; or, 116 | within a display generated by the Derivative Works, if and 117 | wherever such third-party notices normally appear. The contents 118 | of the NOTICE file are for informational purposes only and 119 | do not modify the License. You may add Your own attribution 120 | notices within Derivative Works that You distribute, alongside 121 | or as an addendum to the NOTICE text from the Work, provided 122 | that such additional attribution notices cannot be construed 123 | as modifying the License. 
124 | 125 | You may add Your own copyright statement to Your modifications and 126 | may provide additional or different license terms and conditions 127 | for use, reproduction, or distribution of Your modifications, or 128 | for any such Derivative Works as a whole, provided Your use, 129 | reproduction, and distribution of the Work otherwise complies with 130 | the conditions stated in this License. 131 | 132 | 5. Submission of Contributions. Unless You explicitly state otherwise, 133 | any Contribution intentionally submitted for inclusion in the Work 134 | by You to the Licensor shall be under the terms and conditions of 135 | this License, without any additional terms or conditions. 136 | Notwithstanding the above, nothing herein shall supersede or modify 137 | the terms of any separate license agreement you may have executed 138 | with Licensor regarding such Contributions. 139 | 140 | 6. Trademarks. This License does not grant permission to use the trade 141 | names, trademarks, service marks, or product names of the Licensor, 142 | except as required for reasonable and customary use in describing the 143 | origin of the Work and reproducing the content of the NOTICE file. 144 | 145 | 7. Disclaimer of Warranty. Unless required by applicable law or 146 | agreed to in writing, Licensor provides the Work (and each 147 | Contributor provides its Contributions) on an "AS IS" BASIS, 148 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 149 | implied, including, without limitation, any warranties or conditions 150 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 151 | PARTICULAR PURPOSE. You are solely responsible for determining the 152 | appropriateness of using or redistributing the Work and assume any 153 | risks associated with Your exercise of permissions under this License. 154 | 155 | 8. Limitation of Liability. 
In no event and under no legal theory, 156 | whether in tort (including negligence), contract, or otherwise, 157 | unless required by applicable law (such as deliberate and grossly 158 | negligent acts) or agreed to in writing, shall any Contributor be 159 | liable to You for damages, including any direct, indirect, special, 160 | incidental, or consequential damages of any character arising as a 161 | result of this License or out of the use or inability to use the 162 | Work (including but not limited to damages for loss of goodwill, 163 | work stoppage, computer failure or malfunction, or any and all 164 | other commercial damages or losses), even if such Contributor 165 | has been advised of the possibility of such damages. 166 | 167 | 9. Accepting Warranty or Additional Liability. While redistributing 168 | the Work or Derivative Works thereof, You may choose to offer, 169 | and charge a fee for, acceptance of support, warranty, indemnity, 170 | or other liability obligations and/or rights consistent with this 171 | License. However, in accepting such obligations, You may act only 172 | on Your own behalf and on Your sole responsibility, not on behalf 173 | of any other Contributor, and only if You agree to indemnify, 174 | defend, and hold each Contributor harmless for any liability 175 | incurred by, or claims asserted against, such Contributor by reason 176 | of your accepting any such warranty or additional liability. 177 | 178 | END OF TERMS AND CONDITIONS 179 | 180 | APPENDIX: How to apply the Apache License to your work. 181 | 182 | To apply the Apache License to your work, attach the following 183 | boilerplate notice, with the fields enclosed by brackets "[]" 184 | replaced with your own identifying information. (Don't include 185 | the brackets!) The text should be enclosed in the appropriate 186 | comment syntax for the file format. 
We also recommend that a 187 | file or class name and description of purpose be included on the 188 | same "printed page" as the copyright notice for easier 189 | identification within third-party archives. 190 | 191 | Copyright 2015-2016, Baidu USA LLC. 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | you may not use this file except in compliance with the License. 195 | You may obtain a copy of the License at 196 | 197 | http://www.apache.org/licenses/LICENSE-2.0 198 | 199 | Unless required by applicable law or agreed to in writing, software 200 | distributed under the License is distributed on an "AS IS" BASIS, 201 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 202 | See the License for the specific language governing permissions and 203 | limitations under the License. -------------------------------------------------------------------------------- /src/transforms/warp-ctc/README.md: -------------------------------------------------------------------------------- 1 | # Instructions 2 | 3 | Install warp-ctc first using cmake 4 | 5 | - `mkdir build` 6 | - `cd build` 7 | - `cmake ..` 8 | - `make` 9 | 10 | Now, go to the python directory and run `sudo python setup.py install` 11 | 12 | # Credits 13 | 14 | https://github.com/baidu-research/warp-ctc 15 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/doc/baidu-research-logo-small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/src/transforms/warp-ctc/doc/baidu-research-logo-small.png -------------------------------------------------------------------------------- /src/transforms/warp-ctc/doc/deep-speech-ctc-small.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NervanaSystems/deepspeech/2ea95f5ce1bb39fa4de26807beed905b7889de59/src/transforms/warp-ctc/doc/deep-speech-ctc-small.png -------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | 4 | class DataLoader: 5 | def __init__(self, mbsz=128, min_len=20, max_len=30, num_classes=29): 6 | self.mbsz = mbsz 7 | self.min_len = min_len 8 | self.max_len = max_len 9 | self.num_classes = num_classes 10 | 11 | def sample(self): 12 | inputs = [] 13 | input_lens = [] 14 | outputs = [] 15 | output_lens = [] 16 | for i in xrange(self.mbsz): 17 | length = random.randint(self.min_len, self.max_len) 18 | input_lens.append(length) 19 | input = [random.randint(1, self.num_classes-1) for j in xrange(length)] 20 | #output = input[:] # identity output 21 | output = input[::4] # every 4th input is output 22 | """ 23 | # for acronym output 24 | output = [] 25 | flag = True 26 | for j in xrange(len(input)): 27 | if input[j] == 1: 28 | flag = True 29 | elif flag == True: 30 | flag = False 31 | output.append(input[j]) 32 | """ 33 | output_lens.append(len(output)) 34 | inputs.append(input) 35 | outputs.append(output) 36 | 37 | input_arr = np.zeros((self.mbsz, self.max_len, self.num_classes)) 38 | for i in xrange(self.mbsz): 39 | for j in xrange(len(inputs[i])): 40 | input_arr[i, j, inputs[i][j]] = 1.0 41 | label_arr = np.zeros((sum(output_lens)), dtype=np.int32) 42 | pos = 0 43 | for i in xrange(self.mbsz): 44 | label_arr[pos:pos+output_lens[i]] = outputs[i] 45 | pos += output_lens[i] 46 | 47 | return input_arr, np.array(input_lens, dtype=np.int32), label_arr, np.array(output_lens, dtype=np.int32) 48 | 49 | 50 | if __name__ == '__main__': 51 | dl = DataLoader() 52 | ret = dl.sample() 53 | print ret[0].shape 54 | 
-------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/rnnctc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import lasagne 5 | 6 | import ctc 7 | 8 | num_classes = 5 9 | mbsz = 1 10 | min_len = 12 11 | max_len = 12 12 | n_hidden = 100 13 | grad_clip = 100 14 | 15 | input_lens = T.ivector('input_lens') 16 | output = T.ivector('output') 17 | output_lens = T.ivector('output_lens') 18 | 19 | l_in = lasagne.layers.InputLayer(shape=(mbsz, max_len, num_classes)) 20 | 21 | h1f = lasagne.layers.RecurrentLayer(l_in, n_hidden, grad_clipping=grad_clip, 22 | nonlinearity=lasagne.nonlinearities.rectify) 23 | h1b = lasagne.layers.RecurrentLayer(l_in, n_hidden, grad_clipping=grad_clip, 24 | nonlinearity=lasagne.nonlinearities.rectify, backwards = True) 25 | h1 = lasagne.layers.ElemwiseSumLayer([h1f, h1b]) 26 | 27 | h2f = lasagne.layers.RecurrentLayer(h1, n_hidden, grad_clipping=grad_clip, 28 | nonlinearity=lasagne.nonlinearities.rectify) 29 | h2b = lasagne.layers.RecurrentLayer(h1, n_hidden, grad_clipping=grad_clip, 30 | nonlinearity=lasagne.nonlinearities.rectify, backwards = True) 31 | h2 = lasagne.layers.ElemwiseSumLayer([h2f, h2b]) 32 | 33 | h3 = lasagne.layers.RecurrentLayer(h2, num_classes, grad_clipping=grad_clip, 34 | nonlinearity=lasagne.nonlinearities.linear) 35 | # Turn into 36 | l_out = lasagne.layers.DimshuffleLayer(h3, (1, 0, 2)) 37 | 38 | network_output = lasagne.layers.get_output(l_out) 39 | 40 | cost = T.mean(ctc.cpu_ctc_th(network_output, input_lens, output, output_lens)) 41 | grads = T.grad(cost, wrt=network_output) 42 | all_params = lasagne.layers.get_all_params(l_out) 43 | updates = lasagne.updates.adam(cost, all_params, 0.001) 44 | 45 | train = theano.function([l_in.input_var, input_lens, output, output_lens], cost, updates=updates) 46 | predict = theano.function([l_in.input_var], 
network_output) 47 | get_grad = theano.function([l_in.input_var, input_lens, output, output_lens], grads) 48 | 49 | from loader import DataLoader 50 | data_loader = DataLoader(mbsz=mbsz, min_len=min_len, max_len=max_len, num_classes=num_classes) 51 | 52 | i = 1 53 | while True: 54 | i += 1 55 | print i 56 | sample = data_loader.sample() 57 | cost = train(*sample) 58 | out = predict(sample[0]) 59 | print cost 60 | print "input", sample[0][0].argmax(1) 61 | print "prediction", out[:, 0].argmax(1) 62 | print "expected", sample[2][:sample[3][0]] 63 | if i == 10000: 64 | grads = get_grad(*sample) 65 | import ipdb; ipdb.set_trace() 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/examples/simple.py: -------------------------------------------------------------------------------- 1 | from ctc import cpu_ctc_th, cpu_ctc_np 2 | import numpy as np 3 | import theano 4 | import theano.tensor as T 5 | 6 | acts = np.array([[[0.1, 0.6, 0.1, 0.1, 0.1]], 7 | [[0.1, 0.1, 0.6, 0.1, 0.1]]]) 8 | 9 | labels = np.array([1, 2]) 10 | label_lens = np.array([2]) 11 | act_lens = np.array([2]) 12 | cost, grads = cpu_ctc_np(acts, act_lens, labels, label_lens) 13 | print "expected cost:", 2.46285844 14 | 15 | print "cost (numpy):", cost.sum() 16 | print "grads (numpy):", grads 17 | 18 | def create_theano_func(): 19 | acts = T.ftensor3() 20 | act_lens = T.ivector() 21 | labels = T.ivector() 22 | label_lens = T.ivector() 23 | costs = cpu_ctc_th(acts, act_lens, labels, label_lens) 24 | cost = T.mean(costs) 25 | grads = T.grad(cost, acts) 26 | f = theano.function([acts, act_lens, labels, label_lens], cost, allow_input_downcast=True) 27 | g = theano.function([acts, act_lens, labels, label_lens], grads, allow_input_downcast=True) 28 | return f, g 29 | 30 | f, g = create_theano_func() 31 | print "cost (theano):", f(acts, act_lens, labels, label_lens).sum() 32 | print "grads (theano)", g(acts, act_lens, labels, label_lens) 33 | 34 | 
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/LICENSE: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctaloadbalance.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasearch.cuh" 38 | #include "loadstore.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // DeviceLoadBalancingSearch 44 | // Upper Bound search from A (needles) into B (haystack). The A values are 45 | // natural numbers from aBegin to aEnd. bFirst is the index of the B value at 46 | // bBegin in shared memory. 47 | 48 | template 49 | MGPU_DEVICE void DeviceSerialLoadBalanceSearch(const int* b_shared, int aBegin, 50 | int aEnd, int bFirst, int bBegin, int bEnd, int* a_shared) { 51 | 52 | int bKey = b_shared[bBegin]; 53 | 54 | #pragma unroll 55 | for(int i = 0; i < VT; ++i) { 56 | bool p; 57 | if(RangeCheck) 58 | p = (aBegin < aEnd) && ((bBegin >= bEnd) || (aBegin < bKey)); 59 | else 60 | p = aBegin < bKey; 61 | 62 | if(p) 63 | // Advance A (the needle). 64 | a_shared[aBegin++] = bFirst + bBegin; 65 | else 66 | // Advance B (the haystack). 
67 | bKey = b_shared[++bBegin]; 68 | } 69 | } 70 | 71 | //////////////////////////////////////////////////////////////////////////////// 72 | // CTALoadBalance 73 | // Computes upper_bound(counting_iterator(first), b_global) - 1. 74 | 75 | // Unlike most other CTA* functions, CTALoadBalance loads from global memory. 76 | // This returns the loaded B elements at the beginning or end of shared memory 77 | // depending on the aFirst argument. 78 | 79 | // CTALoadBalance requires NT * VT + 2 slots of shared memory. 80 | template 81 | MGPU_DEVICE int4 CTALoadBalance(int destCount, InputIt b_global, 82 | int sourceCount, int block, int tid, const int* mp_global, 83 | int* indices_shared, bool loadPrecedingB) { 84 | 85 | int4 range = ComputeMergeRange(destCount, sourceCount, block, 0, NT * VT, 86 | mp_global); 87 | 88 | int a0 = range.x; 89 | int a1 = range.y; 90 | int b0 = range.z; 91 | int b1 = range.w; 92 | if(!b0) loadPrecedingB = false; 93 | 94 | // Load one trailing term from B. If we're already at the end, fill the 95 | // end of the buffer with destCount. 96 | int aCount = a1 - a0; 97 | int bCount = b1 - b0; 98 | int extended = b1 < sourceCount; 99 | int loadCount = bCount + extended; 100 | int fillCount = NT * VT + 1 - loadCount - aCount; 101 | 102 | int* a_shared = indices_shared; 103 | int* b_shared = indices_shared + aCount + (int)loadPrecedingB; 104 | 105 | // Load the B values. 106 | // DeviceMemToMemLoop(bCount + extended + (int)loadPrecedingB, 107 | // b_global + b0 - (int)loadPrecedingB, tid, 108 | // b_shared - (int)loadPrecedingB); 109 | 110 | for(int i = tid - (int)loadPrecedingB; i < bCount + extended; i += NT) 111 | b_shared[i] = b_global[b0 + i]; 112 | 113 | // Fill the end of the array with destCount. 114 | for(int i = tid + extended; i < fillCount; i += NT) 115 | b_shared[bCount + i] = destCount; 116 | __syncthreads(); 117 | 118 | // Run a merge path to find the start of the serial merge for each thread. 
119 | int diag = VT * tid; 120 | int mp = MergePath(mgpu::counting_iterator(a0), 121 | aCount, b_shared, bCount, diag, mgpu::less()); 122 | 123 | int a0tid = a0 + mp; 124 | int b0tid = diag - mp; 125 | 126 | // Subtract 1 from b0 because we want to return upper_bound - 1. 127 | DeviceSerialLoadBalanceSearch(b_shared, a0tid, a1, b0 - 1, 128 | b0tid, bCount, a_shared - a0); 129 | __syncthreads(); 130 | 131 | b0 -= (int)loadPrecedingB; 132 | return make_int4(a0, a1, b0, b1); 133 | } 134 | 135 | 136 | } // namespace mgpu 137 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctascan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpuenums.h" 38 | #include "deviceutil.cuh" 39 | #include "intrinsics.cuh" 40 | 41 | namespace mgpu { 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // CTAReduce 45 | 46 | template > 47 | struct CTAReduce { 48 | typedef typename Op::first_argument_type T; 49 | enum { Size = NT, Capacity = NT }; 50 | struct Storage { T shared[Capacity]; }; 51 | 52 | MGPU_DEVICE static T Reduce(int tid, T x, Storage& storage, Op op = Op()) { 53 | storage.shared[tid] = x; 54 | __syncthreads(); 55 | 56 | // Fold the data in half with each pass. 57 | #pragma unroll 58 | for(int destCount = NT / 2; destCount >= 1; destCount /= 2) { 59 | if(tid < destCount) { 60 | // Read from the right half and store to the left half. 
// (Continuation of the generic CTAReduce<NT, Op>::Reduce body begun above.)
            x = op(x, storage.shared[destCount + tid]);
            storage.shared[tid] = x;
        }
        __syncthreads();
    }
    T total = storage.shared[0];
    __syncthreads();
    return total;
}
};

#if __CUDA_ARCH__ >= 300

// Kepler (SM 3.0+) specialization of CTAReduce for integer addition.
// NOTE(review): the template parameter lists in this file were stripped by
// the text extraction; they are restored here to match upstream moderngpu.
template<int NT>
struct CTAReduce<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    typedef int T;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    // Returns the CTA-wide sum of x. Assumes NT is a multiple of WARP_SIZE.
    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        // In the first phase, threads cooperatively find the reduction within
        // their segment. The segments are SecSize threads (NT / WARP_SIZE)
        // wide.
        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_add(x, offset, SecSize);

        // The last thread in each segment stores the local reduction to shared
        // memory.
        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        // Reduce the totals of each input segment. The spine is WARP_SIZE
        // threads wide.
        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_add(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

// Kepler specialization of CTAReduce for integer max. Same two-phase
// shuffle structure as the plus<int> specialization, using shfl_max.
template<int NT>
struct CTAReduce<NT, mgpu::maximum<int> > {
    typedef mgpu::maximum<int> Op;
    enum { Size = NT, Capacity = WARP_SIZE };
    struct Storage { int shared[Capacity]; };

    MGPU_DEVICE static int Reduce(int tid, int x, Storage& storage,
        Op op = Op()) {

        const int NumSections = WARP_SIZE;
        const int SecSize = NT / NumSections;
        int lane = (SecSize - 1) & tid;
        int sec = tid / SecSize;

        #pragma unroll
        for(int offset = 1; offset < SecSize; offset *= 2)
            x = shfl_max(x, offset, SecSize);

        if(SecSize - 1 == lane) storage.shared[sec] = x;
        __syncthreads();

        if(tid < NumSections) {
            x = storage.shared[tid];
            #pragma unroll
            for(int offset = 1; offset < NumSections; offset *= 2)
                x = shfl_max(x, offset, NumSections);
            storage.shared[tid] = x;
        }
        __syncthreads();

        int reduction = storage.shared[NumSections - 1];
        __syncthreads();

        return reduction;
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTAScan

template<int NT, typename Op = mgpu::plus<int> >
struct CTAScan {
    typedef typename Op::result_type T;
    enum { Size = NT, Capacity = 2 * NT + 1 };
    struct Storage { T shared[Capacity]; };

    // CTA-wide scan of one value per thread. *total receives the reduction of
    // all NT inputs. type selects inclusive or exclusive results.
    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage, T* total,
        MgpuScanType type = MgpuScanTypeExc, T identity = (T)0, Op op = Op()) {

        storage.shared[tid] = x;
        int first = 0;
        __syncthreads();

        // Double-buffered Hillis-Steele scan; 'first' toggles between the two
        // halves of shared memory on each pass.
        #pragma unroll
        for(int offset = 1; offset < NT; offset += offset) {
            if(tid >= offset)
                x = op(storage.shared[first + tid - offset], x);
            first = NT - first;
            storage.shared[first + tid] = x;
            __syncthreads();
        }
        *total = storage.shared[first + NT - 1];

        if(MgpuScanTypeExc == type)
            x = tid ? storage.shared[first + tid - 1] : identity;

        __syncthreads();
        return x;
    }
    MGPU_DEVICE static T Scan(int tid, T x, Storage& storage) {
        T total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, (T)0, Op());
    }
};

////////////////////////////////////////////////////////////////////////////////
// Special partial specialization for CTAScan on Kepler.
// This uses the shfl intrinsic to reduce scan latency.

#if __CUDA_ARCH__ >= 300

template<int NT>
struct CTAScan<NT, mgpu::plus<int> > {
    typedef mgpu::plus<int> Op;
    enum { Size = NT, NumSegments = WARP_SIZE, SegSize = NT / NumSegments };
    enum { Capacity = NumSegments + 1 };
    struct Storage { int shared[Capacity + 1]; };

    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage, int* total,
        MgpuScanType type = MgpuScanTypeExc, int identity = 0, Op op = Op()) {

        // Define WARP_SIZE segments that are NT / WARP_SIZE large.
        // Each warp makes log(SegSize) shfl_add calls.
        // The spine makes log(WARP_SIZE) shfl_add calls.
        int lane = (SegSize - 1) & tid;
        int segment = tid / SegSize;

        // Scan each segment using shfl_add.
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < SegSize; offset *= 2)
            scan = shfl_add(scan, offset, SegSize);

        // Store the reduction (last element) of each segment into storage.
        if(SegSize - 1 == lane) storage.shared[segment] = scan;
        __syncthreads();

        // Warp 0 does a full shfl warp scan on the partials. The total is
        // stored to shared[NumSegments]. (NumSegments = WARP_SIZE)
        if(tid < NumSegments) {
            int y = storage.shared[tid];
            int scan = y;
            #pragma unroll
            for(int offset = 1; offset < NumSegments; offset *= 2)
                scan = shfl_add(scan, offset, NumSegments);
            storage.shared[tid] = scan - y;
            if(NumSegments - 1 == tid) storage.shared[NumSegments] = scan;
        }
        __syncthreads();

        // Add the scanned partials back in and convert to exclusive scan.
        scan += storage.shared[segment];
        if(MgpuScanTypeExc == type) {
            scan -= x;
            if(identity && !tid) scan = identity;
        }
        *total = storage.shared[NumSegments];
        __syncthreads();

        return scan;
    }
    MGPU_DEVICE static int Scan(int tid, int x, Storage& storage) {
        int total;
        return Scan(tid, x, storage, &total, MgpuScanTypeExc, 0);
    }
};

#endif // __CUDA_ARCH__ >= 300

////////////////////////////////////////////////////////////////////////////////
// CTABinaryScan
// Exclusive scan of one bool per thread; *total receives the CTA-wide count
// of set flags. 'shared' must hold at least NumWarps + 1 ints.

template<int NT>
MGPU_DEVICE int CTABinaryScan(int tid, bool x, int* shared, int* total) {
    const int NumWarps = NT / WARP_SIZE;
    int warp = tid / WARP_SIZE;
    // BUGFIX: lane is the thread's position within its warp. The dumped text
    // read 'int lane = (WARP_SIZE - 1);', which would make bfe() below
    // extract 31 bits for every thread. Restored '& tid' per upstream
    // moderngpu.
    int lane = (WARP_SIZE - 1) & tid;

    // Store the bit totals for each warp.
    uint bits = __ballot(x);
    shared[warp] = popc(bits);
    __syncthreads();

#if __CUDA_ARCH__ >= 300
    if(tid < NumWarps) {
        int x = shared[tid];
        int scan = x;
        #pragma unroll
        for(int offset = 1; offset < NumWarps; offset *= 2)
            scan = shfl_add(scan, offset, NumWarps);
        shared[tid] = scan - x;
        // BUGFIX(review): store the CTA total so the read of shared[NumWarps]
        // below is defined on this path too (matches the SM 2.x branch and
        // upstream moderngpu).
        if(NumWarps - 1 == tid) shared[NumWarps] = scan;
    }
    __syncthreads();

#else
    // Thread 0 scans warp totals.
    if(!tid) {
        int scan = 0;
        #pragma unroll
        for(int i = 0; i < NumWarps; ++i) {
            int y = shared[i];
            shared[i] = scan;
            scan += y;
        }
        shared[NumWarps] = scan;
    }
    __syncthreads();

#endif // __CUDA_ARCH__ >= 300

    // Add the warp scan back into the partials.
    int scan = shared[warp] + __popc(bfe(bits, 0, lane));
    *total = shared[NumWarps];
    __syncthreads();
    return scan;
}

} // namespace mgpu

--------------------------------------------------------------------------------
/src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasearch.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 *
 * Code and text by Sean Baxter, NVIDIA Research
 * See http://nvlabs.github.io/moderngpu for repository and documentation.
 *
 ******************************************************************************/

#pragma once

#include "deviceutil.cuh"
#include "../mgpudevice.cuh"

namespace mgpu {

// One step of a biased binary search: splits [begin, end) at a point biased
// towards 'end' by 'shift' and narrows the interval around 'key'.
// NOTE(review): the template parameter lists in this file were stripped by
// the text extraction; they are restored here to match upstream moderngpu.
template<MgpuBounds Bounds, typename IntT, typename It, typename T,
    typename Comp>
MGPU_HOST_DEVICE void BinarySearchIt(It data, int& begin, int& end, T key,
    int shift, Comp comp) {

    IntT scale = (1<< shift) - 1;
    int mid = (int)((begin + scale * end)>> shift);

    T key2 = data[mid];
    bool pred = (MgpuBoundsUpper == Bounds) ?
        !comp(key, key2) :
        comp(key2, key);
    if(pred) begin = mid + 1;
    else end = mid;
}

// Binary search biased towards the end of the array; 'levels' controls how
// many heavily-biased probes run before falling back to unbiased bisection.
// The first probes use IntT = int64 to avoid overflow in begin + scale * end.
template<MgpuBounds Bounds, typename T, typename It, typename Comp>
MGPU_HOST_DEVICE int BiasedBinarySearch(It data, int count, T key, int levels,
    Comp comp) {

    int begin = 0;
    int end = count;

    if(levels >= 4 && begin < end)
        BinarySearchIt<Bounds, int64>(data, begin, end, key, 9, comp);
    if(levels >= 3 && begin < end)
        BinarySearchIt<Bounds, int64>(data, begin, end, key, 7, comp);
    if(levels >= 2 && begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 5, comp);
    if(levels >= 1 && begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 4, comp);

    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

// Standard (unbiased) binary search returning the lower- or upper-bound
// insertion point of key, selected by Bounds.
template<MgpuBounds Bounds, typename It, typename T, typename Comp>
MGPU_HOST_DEVICE int BinarySearch(It data, int count, T key, Comp comp) {
    int begin = 0;
    int end = count;
    while(begin < end)
        BinarySearchIt<Bounds, int>(data, begin, end, key, 1, comp);
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// MergePath search

// Finds the A-side intersection of cross-diagonal 'diag' with the merge path
// of sorted sequences a and b. Returns the number of A elements consumed.
template<MgpuBounds Bounds, typename It1, typename It2, typename Comp>
MGPU_HOST_DEVICE int MergePath(It1 a, int aCount, It2 b, int bCount, int diag,
    Comp comp) {

    typedef typename std::iterator_traits<It1>::value_type T;
    int begin = max(0, diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end)>> 1;
        T aKey = a[mid];
        T bKey = b[diag - 1 - mid];
        bool pred = (MgpuBoundsUpper == Bounds) ?
            comp(aKey, bKey) :
            !comp(bKey, aKey);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}


////////////////////////////////////////////////////////////////////////////////
// SegmentedMergePath search

template<typename InputIt, typename Comp>
MGPU_HOST_DEVICE int SegmentedMergePath(InputIt keys, int aOffset, int aCount,
    int bOffset, int bCount, int leftEnd, int rightStart, int diag, Comp comp) {

    // leftEnd and rightStart are defined from the origin, and diag is defined
    // from aOffset.
    // We only need to run a Merge Path search if the diagonal intersects the
    // segment that strides the left and right halves (i.e. is between leftEnd
    // and rightStart).
    if(aOffset + diag <= leftEnd) return diag;
    if(aOffset + diag >= rightStart) return aCount;

    bCount = min(bCount, rightStart - bOffset);
    int begin = max(max(leftEnd - aOffset, 0), diag - bCount);
    int end = min(diag, aCount);

    while(begin < end) {
        int mid = (begin + end)>> 1;
        int ai = aOffset + mid;
        int bi = bOffset + diag - 1 - mid;

        bool pred = !comp(keys[bi], keys[ai]);
        if(pred) begin = mid + 1;
        else end = mid;
    }
    return begin;
}

////////////////////////////////////////////////////////////////////////////////
// BalancedPath search

// Like MergePath but balances duplicate runs between A and B. Returns the
// A index in .x and a "starred partition" flag in .y.
template<bool Duplicates, typename IntT, typename InputIt1, typename InputIt2,
    typename Comp>
MGPU_HOST_DEVICE int2 BalancedPath(InputIt1 a, int aCount, InputIt2 b,
    int bCount, int diag, int levels, Comp comp) {

    typedef typename std::iterator_traits<InputIt1>::value_type T;

    int p = MergePath<MgpuBoundsLower>(a, aCount, b, bCount, diag, comp);
    int aIndex = p;
    int bIndex = diag - p;

    bool star = false;
    if(bIndex < bCount) {
        if(Duplicates) {
            T x = b[bIndex];

            // Search for the beginning of the duplicate run in both A and B.
            int aStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(a, aIndex, x,
                levels, comp);
            int bStart = BiasedBinarySearch<MgpuBoundsLower, IntT>(b, bIndex, x,
                levels, comp);

            // The distance between the merge path and the lower_bound is the
            // 'run'. We add up the a- and b- runs and evenly distribute them to
            // get a stairstep path.
            int aRun = aIndex - aStart;
            int bRun = bIndex - bStart;
            int xCount = aRun + bRun;

            // Attempt to advance b and regress a.
            int bAdvance = max(xCount>> 1, bRun);
            int bEnd = min(bCount, bStart + bAdvance + 1);
            int bRunEnd = BinarySearch<MgpuBoundsUpper>(b + bIndex,
                bEnd - bIndex, x, comp) + bIndex;
            bRun = bRunEnd - bStart;

            bAdvance = min(bAdvance, bRun);
            int aAdvance = xCount - bAdvance;

            bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
            aIndex = aStart + aAdvance;

            if(roundUp) star = true;
        } else {
            if(aIndex && aCount) {
                T aKey = a[aIndex - 1];
                T bKey = b[bIndex];

                // If the last consumed element in A (aIndex - 1) is the same as
                // the next element in B (bIndex), we're sitting at a starred
                // partition.
                if(!comp(aKey, bKey)) star = true;
            }
        }
    }
    return make_int2(aIndex, star);
}

} // namespace mgpu

--------------------------------------------------------------------------------
/src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasegreduce.cuh:
--------------------------------------------------------------------------------
/******************************************************************************
 * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved.
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctasegscan.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // Segmented reduce utility functions. 44 | 45 | // Extract the upper-bound indices from the coded ranges. Decrement to include 46 | // the first addressed row/segment. 47 | 48 | struct SegReduceRange { 49 | int begin; 50 | int end; 51 | int total; 52 | bool flushLast; 53 | }; 54 | 55 | MGPU_DEVICE SegReduceRange DeviceShiftRange(int limit0, int limit1) { 56 | SegReduceRange range; 57 | range.begin = 0x7fffffff & limit0; 58 | range.end = 0x7fffffff & limit1; 59 | range.total = range.end - range.begin; 60 | range.flushLast = 0 == (0x80000000 & limit1); 61 | range.end += !range.flushLast; 62 | return range; 63 | } 64 | 65 | // Reconstitute row/segment indices from a starting row index and packed end 66 | // flags. Used for pre-processed versions of interval reduce and interval Spmv. 67 | template 68 | MGPU_DEVICE void DeviceExpandFlagsToRows(int first, int endFlags, 69 | int rows[VT + 1]) { 70 | 71 | rows[0] = first; 72 | #pragma unroll 73 | for(int i = 0; i < VT; ++i) { 74 | if((1<< i) & endFlags) ++first; 75 | rows[i + 1] = first; 76 | } 77 | } 78 | 79 | //////////////////////////////////////////////////////////////////////////////// 80 | // After loading CSR terms into shared memory, each thread binary searches 81 | // (upper-bound) to find its starting point. Each thread then walks forward, 82 | // emitting the csr0-relative row indices to register. 83 | 84 | template 85 | MGPU_DEVICE int DeviceExpandCsrRows(int tidOffset, int* csr_shared, 86 | int numRows, int end, int rows[VT + 1], int rowStarts[VT]) { 87 | 88 | // Each thread binary searches for its starting row. 
89 | int row = BinarySearch(csr_shared, numRows, tidOffset, 90 | mgpu::less()) - 1; 91 | 92 | // Each thread starts at row and scans forward, emitting row IDs into 93 | // register. Store the CTA-local row index (starts at 0) to rows and the 94 | // start of the row (globally) to rowStarts. 95 | int curOffset = csr_shared[row]; 96 | int nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 97 | 98 | rows[0] = row; 99 | rowStarts[0] = curOffset; 100 | int endFlags = 0; 101 | 102 | #pragma unroll 103 | for(int i = 1; i <= VT; ++i) { 104 | // Advance the row cursor when the iterator hits the next row offset. 105 | if(tidOffset + i == nextOffset) { 106 | // Set an end flag when the cursor advances to the next row. 107 | endFlags |= 1<< (i - 1); 108 | 109 | // Advance the cursor and load the next row offset. 110 | ++row; 111 | curOffset = nextOffset; 112 | nextOffset = (row + 1 < numRows) ? csr_shared[row + 1] : end; 113 | } 114 | rows[i] = row; 115 | if(i < VT) rowStarts[i] = curOffset; 116 | } 117 | __syncthreads(); 118 | 119 | return endFlags; 120 | } 121 | 122 | //////////////////////////////////////////////////////////////////////////////// 123 | // DeviceSegReducePrepare 124 | // Expand non-empty interval of CSR elements into row indices. Compute end-flags 125 | // by comparing adjacent row IDs. 126 | 127 | // DeviceSegReducePrepare may be called either by a pre-processing kernel or by 128 | // the kernel that actually evaluates the segmented reduction if no preprocesing 129 | // is desired. 130 | struct SegReduceTerms { 131 | int endFlags; 132 | int tidDelta; 133 | }; 134 | 135 | template 136 | MGPU_DEVICE SegReduceTerms DeviceSegReducePrepare(int* csr_shared, int numRows, 137 | int tid, int gid, bool flushLast, int rows[VT + 1], int rowStarts[VT]) { 138 | 139 | // Pass a sentinel (end) to point to the next segment start. If we flush, 140 | // this is the end of this tile. 
Otherwise it is INT_MAX 141 | int endFlags = DeviceExpandCsrRows(gid + VT * tid, csr_shared, 142 | numRows, flushLast ? (gid + NT * VT) : INT_MAX, rows, rowStarts); 143 | 144 | // Find the distance to to scan to compute carry-in for each thread. Use the 145 | // existance of an end flag anywhere in the thread to determine if carry-out 146 | // values from the left should propagate through to the right. 147 | int tidDelta = DeviceFindSegScanDelta(tid, rows[0] != rows[VT], 148 | csr_shared); 149 | 150 | SegReduceTerms terms = { endFlags, tidDelta }; 151 | return terms; 152 | } 153 | 154 | //////////////////////////////////////////////////////////////////////////////// 155 | // CTASegReduce 156 | // Core segmented reduction code. Supports fast-path and slow-path for intra-CTA 157 | // segmented reduction. Stores partials to global memory. 158 | // Callers feed CTASegReduce::ReduceToGlobal values in thread order. 159 | template 160 | struct CTASegReduce { 161 | typedef CTASegScan SegScan; 162 | 163 | enum { 164 | NV = NT * VT, 165 | Capacity = HalfCapacity ? (NV / 2) : NV 166 | }; 167 | 168 | union Storage { 169 | typename SegScan::Storage segScanStorage; 170 | T values[Capacity]; 171 | }; 172 | 173 | template 174 | MGPU_DEVICE static void ReduceToGlobal(const int rows[VT + 1], int total, 175 | int tidDelta, int startRow, int block, int tid, T data[VT], 176 | DestIt dest_global, T* carryOut_global, T identity, Op op, 177 | Storage& storage) { 178 | 179 | // Run a segmented scan within the thread. 180 | T x, localScan[VT]; 181 | #pragma unroll 182 | for(int i = 0; i < VT; ++i) { 183 | x = i ? op(x, data[i]) : data[i]; 184 | localScan[i] = x; 185 | if(rows[i] != rows[i + 1]) x = identity; 186 | } 187 | 188 | // Run a parallel segmented scan over the carry-out values to compute 189 | // carry-in. 
190 | T carryOut; 191 | T carryIn = SegScan::SegScanDelta(tid, tidDelta, x, 192 | storage.segScanStorage, &carryOut, identity, op); 193 | 194 | // Store the carry-out for the entire CTA to global memory. 195 | if(!tid) carryOut_global[block] = carryOut; 196 | 197 | dest_global += startRow; 198 | if(HalfCapacity && total > Capacity) { 199 | // Add carry-in to each thread-local scan value. Store directly 200 | // to global. 201 | #pragma unroll 202 | for(int i = 0; i < VT; ++i) { 203 | // Add the carry-in to the local scan. 204 | T x2 = op(carryIn, localScan[i]); 205 | 206 | // Store on the end flag and clear the carry-in. 207 | if(rows[i] != rows[i + 1]) { 208 | carryIn = identity; 209 | dest_global[rows[i]] = x2; 210 | } 211 | } 212 | } else { 213 | // All partials fit in shared memory. Add carry-in to each thread- 214 | // local scan value. 215 | #pragma unroll 216 | for(int i = 0; i < VT; ++i) { 217 | // Add the carry-in to the local scan. 218 | T x2 = op(carryIn, localScan[i]); 219 | 220 | // Store reduction when the segment changes and clear the 221 | // carry-in. 222 | if(rows[i] != rows[i + 1]) { 223 | storage.values[rows[i]] = x2; 224 | carryIn = identity; 225 | } 226 | } 227 | __syncthreads(); 228 | 229 | // Cooperatively store reductions to global memory. 230 | for(int index = tid; index < total; index += NT) 231 | dest_global[index] = storage.values[index]; 232 | __syncthreads(); 233 | } 234 | } 235 | }; 236 | 237 | } // namespace mgpu 238 | 239 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasegscan.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 
3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "ctascan.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // DeviceFindSegScanDelta 43 | // Runs an inclusive max-index scan over binary inputs. 44 | 45 | template 46 | MGPU_DEVICE int DeviceFindSegScanDelta(int tid, bool flag, int* delta_shared) { 47 | const int NumWarps = NT / 32; 48 | 49 | int warp = tid / 32; 50 | int lane = 31 & tid; 51 | uint warpMask = 0xffffffff>> (31 - lane); // inclusive search 52 | uint ctaMask = 0x7fffffff>> (31 - lane); // exclusive search 53 | 54 | uint warpBits = __ballot(flag); 55 | delta_shared[warp] = warpBits; 56 | __syncthreads(); 57 | 58 | if(tid < NumWarps) { 59 | uint ctaBits = __ballot(0 != delta_shared[tid]); 60 | int warpSegment = 31 - clz(ctaMask & ctaBits); 61 | int start = (-1 != warpSegment) ? 62 | (31 - clz(delta_shared[warpSegment]) + 32 * warpSegment) : 0; 63 | delta_shared[NumWarps + tid] = start; 64 | } 65 | __syncthreads(); 66 | 67 | // Find the closest flag to the left of this thread within the warp. 68 | // Include the flag for this thread. 69 | int start = 31 - clz(warpMask & warpBits); 70 | if(-1 != start) start += ~31 & tid; 71 | else start = delta_shared[NumWarps + warp]; 72 | __syncthreads(); 73 | 74 | return tid - start; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // CTASegScan 79 | 80 | template > 81 | struct CTASegScan { 82 | typedef _Op Op; 83 | typedef typename Op::result_type T; 84 | enum { NumWarps = NT / 32, Size = NT, Capacity = 2 * NT }; 85 | union Storage { 86 | int delta[NumWarps]; 87 | T values[Capacity]; 88 | }; 89 | 90 | // Each thread passes the reduction of the LAST SEGMENT that it covers. 91 | // flag is set to true if there's at least one segment flag in the thread. 
92 | // SegScan returns the reduction of values for the first segment in this 93 | // thread over the preceding threads. 94 | // Return the value init for the first thread. 95 | 96 | // When scanning single elements per thread, interpret the flag as a BEGIN 97 | // FLAG. If tid's flag is set, its value belongs to thread tid + 1, not 98 | // thread tid. 99 | 100 | // The function returns the reduction of the last segment in the CTA. 101 | 102 | MGPU_DEVICE static T SegScanDelta(int tid, int tidDelta, T x, 103 | Storage& storage, T* carryOut, T identity = (T)0, Op op = Op()) { 104 | 105 | // Run an inclusive scan 106 | int first = 0; 107 | storage.values[first + tid] = x; 108 | __syncthreads(); 109 | 110 | #pragma unroll 111 | for(int offset = 1; offset < NT; offset += offset) { 112 | if(tidDelta >= offset) 113 | x = op(storage.values[first + tid - offset], x); 114 | first = NT - first; 115 | storage.values[first + tid] = x; 116 | __syncthreads(); 117 | } 118 | 119 | // Get the exclusive scan. 120 | x = tid ? storage.values[first + tid - 1] : identity; 121 | *carryOut = storage.values[first + NT - 1]; 122 | __syncthreads(); 123 | return x; 124 | } 125 | 126 | MGPU_DEVICE static T SegScan(int tid, T x, bool flag, Storage& storage, 127 | T* carryOut, T identity = (T)0, Op op = Op()) { 128 | 129 | // Find the left-most thread that covers the first segment of this 130 | // thread. 131 | int tidDelta = DeviceFindSegScanDelta(tid, flag, storage.delta); 132 | 133 | return SegScanDelta(tid, tidDelta, x, storage, carryOut, identity, op); 134 | } 135 | }; 136 | 137 | } // namespace mgpu 138 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/ctasortedsearch.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "../mgpudevice.cuh" 38 | #include "ctasearch.cuh" 39 | 40 | namespace mgpu { 41 | 42 | 43 | //////////////////////////////////////////////////////////////////////////////// 44 | // DeviceSerialSearch 45 | 46 | template 48 | MGPU_DEVICE int3 DeviceSerialSearch(const T* keys_shared, int aBegin, 49 | int aEnd, int bBegin, int bEnd, int aOffset, int bOffset, int* indices, 50 | Comp comp) { 51 | 52 | const int FlagA = IndexA ? 0x80000000 : 1; 53 | const int FlagB = IndexB ? 0x80000000 : 1; 54 | 55 | T aKey = keys_shared[aBegin]; 56 | T bKey = keys_shared[bBegin]; 57 | T aPrev, bPrev; 58 | if(aBegin > 0) aPrev = keys_shared[aBegin - 1]; 59 | if(bBegin > 0) bPrev = keys_shared[bBegin - 1]; 60 | int decisions = 0; 61 | int matchCountA = 0; 62 | int matchCountB = 0; 63 | 64 | #pragma unroll 65 | for(int i = 0; i < VT; ++i) { 66 | bool p; 67 | if(RangeCheck && aBegin >= aEnd) p = false; 68 | else if(RangeCheck && bBegin >= bEnd) p = true; 69 | else p = (MgpuBoundsUpper == Bounds) ? 70 | comp(aKey, bKey) : 71 | !comp(bKey, aKey); 72 | 73 | if(p) { 74 | // aKey is smaller than bKey, so it is inserted before bKey. 75 | // Save bKey's index (bBegin + first) as the result of the search 76 | // and advance to the next needle in A. 77 | bool match = false; 78 | if(MatchA) { 79 | // Test if there is an element in B that matches aKey. 80 | if(MgpuBoundsUpper == Bounds) { 81 | // Upper Bound: We're inserting aKey after bKey. If there 82 | // is a match for aKey it must be bPrev. Check that bPrev 83 | // is in range and equal to aKey. 84 | // The predicate test result !comp(aKey, bPrev) was 85 | // established on the previous A-advancing iteration (it 86 | // failed the comp(aKey, bKey) test to get us to this 87 | // point). Check the other half of the equality condition 88 | // with a second comparison. 
89 | bool inRange = !RangeCheck || (bBegin > aEnd); 90 | match = inRange && !comp(bPrev, aKey); 91 | } else { 92 | // Lower Bound: We're inserting aKey before bKey. If there 93 | // is a match for aKey, it must be bKey. Check that bKey 94 | // is in range and equal to aKey. 95 | // The predicate test !comp(bKey, aKey) has established one 96 | // half of the equality condition. We establish the other 97 | // half with a second comparison. 98 | bool inRange = !RangeCheck || (bBegin < bEnd); 99 | match = inRange && !comp(aKey, bKey); 100 | } 101 | } 102 | 103 | int index = 0; 104 | if(IndexA) index = bOffset + bBegin; 105 | if(match) index |= FlagA; 106 | if(IndexA || MatchA) indices[i] = index; 107 | matchCountA += match; 108 | 109 | // Mark the decision bit to indicate that this iteration has 110 | // progressed A (the needles). 111 | decisions |= 1<< i; 112 | aPrev = aKey; 113 | aKey = keys_shared[++aBegin]; 114 | } else { 115 | // aKey is larger than bKey, so it is inserted after bKey (but we 116 | // don't know where yet). Advance the B index to the next element in 117 | // the haystack to continue the search for the current needle. 118 | bool match = false; 119 | if(MatchB) { 120 | if(MgpuBoundsUpper == Bounds) { 121 | // Upper Bound: aKey is not smaller than bKey. We advance to 122 | // the next haystack element in B. If there is a match in A 123 | // for bKey it must be aKey. By entering this branch we've 124 | // verified that !comp(aKey, bKey). Making the reciprocal 125 | // comparison !comp(bKey, aKey) establishes aKey == bKey. 126 | bool inRange = !RangeCheck || 127 | ((bBegin < bEnd) && (aBegin < aEnd)); 128 | match = inRange && !comp(bKey, aKey); 129 | } else { 130 | // Lower Bound: bKey is smaller than aKey. We advance to the 131 | // next element in B. If there is a match for bKey, it must 132 | // be aPrev. The previous A-advancing iteration proved that 133 | // !comp(bKey, aPrev). 
We test !comp(aPrev, bKey) for the 134 | // other half of the equality condition. 135 | bool inRange = !RangeCheck || 136 | ((bBegin < bEnd) && (aBegin > 0)); 137 | match = inRange && !comp(aPrev, bKey); 138 | } 139 | } 140 | 141 | int index = 0; 142 | if(IndexB) index = aOffset + aBegin; 143 | if(match) index |= FlagB; 144 | if(IndexB || MatchB) indices[i] = index; 145 | matchCountB += match; 146 | 147 | // Keep the decision bit cleared to indicate that this iteration 148 | // has progressed B (the haystack). 149 | bPrev = bKey; 150 | bKey = keys_shared[++bBegin]; 151 | } 152 | } 153 | return make_int3(decisions, matchCountA, matchCountB); 154 | } 155 | 156 | //////////////////////////////////////////////////////////////////////////////// 157 | // CTASortedSearch 158 | // Take keys in shared memory and return indices and b-match flags in shared 159 | // memory. 160 | // NOTE: This function doesn't do any strided-to-thread order transposes so 161 | // using an even number of values per thread will incur no additional bank 162 | // conflicts. 163 | 164 | template 166 | MGPU_DEVICE int2 CTASortedSearch(T* keys_shared, int aStart, int aCount, 167 | int aEnd, int a0, int bStart, int bCount, int bEnd, int b0, bool extended, 168 | int tid, int* indices_shared, Comp comp) { 169 | 170 | // Run a merge path to find the start of the serial search for each thread. 171 | int diag = VT * tid; 172 | int mp = MergePath(keys_shared + aStart, aCount, 173 | keys_shared + bStart, bCount, diag, comp); 174 | int a0tid = mp; 175 | int b0tid = diag - mp; 176 | 177 | // Serial search into register. 
178 | int3 results; 179 | int indices[VT]; 180 | if(extended) 181 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 183 | a0 - aStart, b0 - bStart, indices, comp); 184 | else 185 | results = DeviceSerialSearch(keys_shared, a0tid + aStart, aEnd, b0tid + bStart, bEnd, 187 | a0 - aStart, b0 - bStart, indices, comp); 188 | __syncthreads(); 189 | 190 | // Compact the indices into shared memory. Use the decision bits (set is A, 191 | // cleared is B) to select the destination. 192 | int decisions = results.x; 193 | b0tid += aCount; 194 | #pragma unroll 195 | for(int i = 0; i < VT; ++i) { 196 | if((1<< i) & decisions) { 197 | if(IndexA || MatchA) indices_shared[a0tid++] = indices[i]; 198 | } else { 199 | if(IndexB || MatchB) indices_shared[b0tid++] = indices[i]; 200 | } 201 | } 202 | __syncthreads(); 203 | 204 | // Return the match counts for A and B keys. 205 | return make_int2(results.y, results.z); 206 | } 207 | 208 | } // namespace mgpu 209 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/deviceutil.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "intrinsics.cuh" 38 | 39 | namespace mgpu { 40 | 41 | // Get the difference between two pointers in bytes. 42 | MGPU_HOST_DEVICE ptrdiff_t PtrDiff(const void* a, const void* b) { 43 | return (const byte*)b - (const byte*)a; 44 | } 45 | 46 | // Offset a pointer by i bytes. 
47 | template 48 | MGPU_HOST_DEVICE const T* PtrOffset(const T* p, ptrdiff_t i) { 49 | return (const T*)((const byte*)p + i); 50 | } 51 | template 52 | MGPU_HOST_DEVICE T* PtrOffset(T* p, ptrdiff_t i) { 53 | return (T*)((byte*)p + i); 54 | } 55 | 56 | //////////////////////////////////////////////////////////////////////////////// 57 | // Task range support 58 | // Evenly distributes variable-length arrays over a fixed number of CTAs. 59 | 60 | MGPU_HOST int2 DivideTaskRange(int numItems, int numWorkers) { 61 | div_t d = div(numItems, numWorkers); 62 | return make_int2(d.quot, d.rem); 63 | } 64 | 65 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task) { 66 | int2 range; 67 | range.x = task.x * block; 68 | range.x += min(block, task.y); 69 | range.y = range.x + task.x + (block < task.y); 70 | return range; 71 | } 72 | 73 | MGPU_HOST_DEVICE int2 ComputeTaskRange(int block, int2 task, int blockSize, 74 | int count) { 75 | int2 range = ComputeTaskRange(block, task); 76 | range.x *= blockSize; 77 | range.y = min(count, range.y * blockSize); 78 | return range; 79 | } 80 | 81 | //////////////////////////////////////////////////////////////////////////////// 82 | // DeviceExtractHeadFlags 83 | // Input array flags is a bit array with 32 head flags per word. 84 | // ExtractThreadHeadFlags returns numBits flags starting at bit index. 85 | 86 | MGPU_HOST_DEVICE uint DeviceExtractHeadFlags(const uint* flags, int index, 87 | int numBits) { 88 | 89 | int index2 = index>> 5; 90 | int shift = 31 & index; 91 | uint headFlags = flags[index2]>> shift; 92 | int shifted = 32 - shift; 93 | 94 | if(shifted < numBits) 95 | // We also need to shift in the next set of bits. 96 | headFlags = bfi(flags[index2 + 1], headFlags, shifted, shift); 97 | headFlags &= (1<< numBits) - 1; 98 | return headFlags; 99 | } 100 | 101 | //////////////////////////////////////////////////////////////////////////////// 102 | // DevicePackHeadFlags 103 | // Pack VT bits per thread at 32 bits/thread. 
Will consume an integer number of 104 | // words, because CTA size is a multiple of 32. The first NT * VT / 32 threads 105 | // return packed words. 106 | 107 | template 108 | MGPU_DEVICE uint DevicePackHeadFlags(uint threadBits, int tid, 109 | uint* flags_shared) { 110 | 111 | const int WordCount = NT * VT / 32; 112 | 113 | // Each thread stores its thread bits to flags_shared[tid]. 114 | flags_shared[tid] = threadBits; 115 | __syncthreads(); 116 | 117 | uint packed = 0; 118 | if(tid < WordCount) { 119 | const int Items = MGPU_DIV_UP(32, VT); 120 | int index = 32 * tid; 121 | int first = index / VT; 122 | int bit = 0; 123 | 124 | int rem = index - VT * first; 125 | packed = flags_shared[first]>> rem; 126 | bit = VT - rem; 127 | ++first; 128 | 129 | #pragma unroll 130 | for(int i = 0; i < Items; ++i) { 131 | if(i < Items - 1 || bit < 32) { 132 | uint x = flags_shared[first + i]; 133 | if(bit < 32) packed |= x<< bit; 134 | bit += VT; 135 | } 136 | } 137 | } 138 | __syncthreads(); 139 | 140 | return packed; 141 | } 142 | 143 | } // namespace mgpu 144 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/serialsets.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // SerialSetIntersection 43 | // Emit A if A and B are in range and equal. 
44 | 45 | template 46 | MGPU_DEVICE int SerialSetIntersection(const T* data, int aBegin, int aEnd, 47 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 48 | 49 | const int MinIterations = VT / 2; 50 | int commit = 0; 51 | 52 | #pragma unroll 53 | for(int i = 0; i < VT; ++i) { 54 | bool test = RangeCheck ? 55 | ((aBegin + bBegin < end) && (aBegin < aEnd) && (bBegin < bEnd)) : 56 | (i < MinIterations || (aBegin + bBegin < end)); 57 | 58 | if(test) { 59 | T aKey = data[aBegin]; 60 | T bKey = data[bBegin]; 61 | 62 | bool pA = comp(aKey, bKey); 63 | bool pB = comp(bKey, aKey); 64 | 65 | // The outputs must come from A by definition of set interection. 66 | results[i] = aKey; 67 | indices[i] = aBegin; 68 | 69 | if(!pB) ++aBegin; 70 | if(!pA) ++bBegin; 71 | if(pA == pB) commit |= 1<< i; 72 | } 73 | } 74 | return commit; 75 | } 76 | 77 | //////////////////////////////////////////////////////////////////////////////// 78 | // SerialSetUnion 79 | // Emit A if A <= B. Emit B if B < A. 80 | 81 | template 82 | MGPU_DEVICE int SerialSetUnion(const T* data, int aBegin, int aEnd, 83 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 84 | 85 | const int MinIterations = VT / 2; 86 | int commit = 0; 87 | 88 | #pragma unroll 89 | for(int i = 0; i < VT; ++i) { 90 | bool test = RangeCheck ? 91 | (aBegin + bBegin < end) : 92 | (i < MinIterations || (aBegin + bBegin < end)); 93 | 94 | if(test) { 95 | T aKey = data[aBegin]; 96 | T bKey = data[bBegin]; 97 | 98 | bool pA = false, pB = false; 99 | if(RangeCheck && aBegin >= aEnd) 100 | pB = true; 101 | else if(RangeCheck && bBegin >= bEnd) 102 | pA = true; 103 | else { 104 | // Both are in range. 105 | pA = comp(aKey, bKey); 106 | pB = comp(bKey, aKey); 107 | } 108 | 109 | // Output A in case of a tie, so check if b < a. 110 | results[i] = pB ? bKey : aKey; 111 | indices[i] = pB ? 
bBegin : aBegin; 112 | if(!pB) ++aBegin; 113 | if(!pA) ++bBegin; 114 | commit |= 1<< i; 115 | } 116 | } 117 | return commit; 118 | } 119 | 120 | //////////////////////////////////////////////////////////////////////////////// 121 | // SerialSetDifference 122 | // Emit A if A < B. 123 | 124 | template 125 | MGPU_DEVICE int SerialSetDifference(const T* data, int aBegin, int aEnd, 126 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 127 | 128 | const int MinIterations = VT / 2; 129 | int commit = 0; 130 | 131 | #pragma unroll 132 | for(int i = 0; i < VT; ++i) { 133 | bool test = RangeCheck ? 134 | (aBegin + bBegin < end) : 135 | (i < MinIterations || (aBegin + bBegin < end)); 136 | if(test) { 137 | T aKey = data[aBegin]; 138 | T bKey = data[bBegin]; 139 | 140 | bool pA = false, pB = false; 141 | if(RangeCheck && aBegin >= aEnd) 142 | pB = true; 143 | else if(RangeCheck && bBegin >= bEnd) 144 | pA = true; 145 | else { 146 | pA = comp(aKey, bKey); 147 | pB = comp(bKey, aKey); 148 | } 149 | 150 | // The outputs must come from A by definition of set difference. 151 | results[i] = aKey; 152 | indices[i] = aBegin; 153 | if(!pB) ++aBegin; 154 | if(!pA) ++bBegin; 155 | if(pA) commit |= 1<< i; 156 | } 157 | } 158 | return commit; 159 | } 160 | 161 | //////////////////////////////////////////////////////////////////////////////// 162 | // SerialSetSymDiff 163 | // Emit A if A < B and emit B if B < A. 164 | 165 | template 166 | MGPU_DEVICE int SerialSetSymDiff(const T* data, int aBegin, int aEnd, 167 | int bBegin, int bEnd, int end, T* results, int* indices, Comp comp) { 168 | 169 | const int MinIterations = VT / 2; 170 | int commit = 0; 171 | 172 | #pragma unroll 173 | for(int i = 0; i < VT; ++i) { 174 | bool test = RangeCheck ? 
175 | (aBegin + bBegin < end) : 176 | (i < MinIterations || (aBegin + bBegin < end)); 177 | if(test) { 178 | T aKey = data[aBegin]; 179 | T bKey = data[bBegin]; 180 | 181 | bool pA = false, pB = false; 182 | if(RangeCheck && (bBegin >= bEnd)) 183 | pA = true; 184 | else if(RangeCheck && (aBegin >= aEnd)) 185 | pB = true; 186 | else { 187 | pA = comp(aKey, bKey); 188 | pB = comp(bKey, aKey); 189 | } 190 | 191 | results[i] = pA ? aKey : bKey; 192 | indices[i] = pA ? aBegin : bBegin; 193 | if(!pA) ++bBegin; 194 | if(!pB) ++aBegin; 195 | if(pA != pB) commit |= 1<< i; 196 | } 197 | } 198 | return commit; 199 | } 200 | 201 | //////////////////////////////////////////////////////////////////////////////// 202 | // SerialSetOp 203 | // Uses the MgpuSetOp enum to statically select one of the four serial ops 204 | // above. 205 | 206 | template 207 | MGPU_DEVICE int SerialSetOp(const T* data, int aBegin, int aEnd, 208 | int bBegin, int bEnd, int star, T* results, int* indices, Comp comp) { 209 | 210 | int end = aBegin + bBegin + VT - star; 211 | if(RangeCheck) end = min(end, aEnd + bEnd); 212 | int commit; 213 | switch(Op) { 214 | case MgpuSetOpIntersection: 215 | commit = SerialSetIntersection(data, aBegin, 216 | aEnd, bBegin, bEnd, end, results, indices, comp); 217 | break; 218 | case MgpuSetOpUnion: 219 | commit = SerialSetUnion(data, aBegin, aEnd, 220 | bBegin, bEnd, end, results, indices, comp); 221 | break; 222 | case MgpuSetOpDiff: 223 | commit = SerialSetDifference(data, aBegin, aEnd, 224 | bBegin, bEnd, end, results, indices, comp); 225 | break; 226 | case MgpuSetOpSymDiff: 227 | commit = SerialSetSymDiff(data, aBegin, aEnd, 228 | bBegin, bEnd, end, results, indices, comp); 229 | break; 230 | } 231 | __syncthreads(); 232 | return commit; 233 | } 234 | 235 | } // namespace mgpu 236 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/device/sortnetwork.cuh: 
-------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "deviceutil.cuh" 38 | 39 | namespace mgpu { 40 | 41 | //////////////////////////////////////////////////////////////////////////////// 42 | // Odd-even transposition sorting network. Sorts keys and values in-place in 43 | // register. 44 | // http://en.wikipedia.org/wiki/Odd%E2%80%93even_sort 45 | 46 | // CUDA Compiler does not currently unroll these loops correctly. Write using 47 | // template loop unrolling. 48 | /* 49 | template 50 | MGPU_DEVICE void OddEvenTransposeSort(T* keys, V* values, Comp comp) { 51 | #pragma unroll 52 | for(int level = 0; level < VT; ++level) { 53 | 54 | #pragma unroll 55 | for(int i = 1 & level; i < VT - 1; i += 2) { 56 | if(comp(keys[i + 1], keys[i])) { 57 | mgpu::swap(keys[i], keys[i + 1]); 58 | mgpu::swap(values[i], values[i + 1]); 59 | } 60 | } 61 | } 62 | }*/ 63 | 64 | template 65 | struct OddEvenTransposeSortT { 66 | // Sort segments marked by head flags. If the head flag between i and i + 1 67 | // is set (so that (2<< i) & flags is true), the values belong to different 68 | // segments and are not swapped. 
69 | template 70 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { 71 | #pragma unroll 72 | for(int i = 1 & I; i < VT - 1; i += 2) 73 | if((0 == ((2<< i) & flags)) && comp(keys[i + 1], keys[i])) { 74 | mgpu::swap(keys[i], keys[i + 1]); 75 | mgpu::swap(values[i], values[i + 1]); 76 | } 77 | OddEvenTransposeSortT::Sort(keys, values, flags, comp); 78 | } 79 | }; 80 | template struct OddEvenTransposeSortT { 81 | template 82 | static MGPU_DEVICE void Sort(K* keys, V* values, int flags, Comp comp) { } 83 | }; 84 | 85 | template 86 | MGPU_DEVICE void OddEvenTransposeSort(K* keys, V* values, Comp comp) { 87 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, 0, comp); 88 | } 89 | template 90 | MGPU_DEVICE void OddEvenTransposeSortFlags(K* keys, V* values, int flags, 91 | Comp comp) { 92 | OddEvenTransposeSortT<0, VT>::Sort(keys, values, flags, comp); 93 | } 94 | 95 | //////////////////////////////////////////////////////////////////////////////// 96 | // Batcher Odd-Even Mergesort network 97 | // Unstable but executes much faster than the transposition sort. 98 | // http://en.wikipedia.org/wiki/Batcher_odd%E2%80%93even_mergesort 99 | 100 | template 101 | struct OddEvenMergesortT { 102 | template 103 | MGPU_DEVICE static void CompareAndSwap(K* keys, V* values, int flags, 104 | int a, int b, Comp comp) { 105 | if(b < Count) { 106 | // Mask the bits between a and b. Any head flags in this interval 107 | // means the keys are in different segments and must not be swapped. 
108 | const int Mask = ((2<< b) - 1) ^ ((2<< a) - 1); 109 | if(!(Mask & flags) && comp(keys[b], keys[a])) { 110 | mgpu::swap(keys[b], keys[a]); 111 | mgpu::swap(values[b], values[a]); 112 | } 113 | } 114 | } 115 | 116 | template 117 | struct OddEvenMerge { 118 | template 119 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 120 | Comp comp) { 121 | // Compare and swap 122 | const int M = 2 * R; 123 | OddEvenMerge::Merge(keys, values, flags, comp); 124 | OddEvenMerge::Merge(keys, values, flags, comp); 125 | 126 | #pragma unroll 127 | for(int i = Low2 + R; i + R < Low2 + Width; i += M) 128 | CompareAndSwap(keys, values, flags, i, i + R, comp); 129 | } 130 | }; 131 | template 132 | struct OddEvenMerge { 133 | template 134 | MGPU_DEVICE static void Merge(K* keys, V* values, int flags, 135 | Comp comp) { 136 | CompareAndSwap(keys, values, flags, Low2, Low2 + R, comp); 137 | } 138 | }; 139 | 140 | template 141 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 142 | Comp comp) { 143 | 144 | const int M = Width / 2; 145 | OddEvenMergesortT::Sort(keys, values, flags, comp); 146 | OddEvenMergesortT::Sort(keys, values, flags, comp); 147 | OddEvenMerge<1, Low>::Merge(keys, values, flags, comp); 148 | } 149 | }; 150 | template struct OddEvenMergesortT<1, Low, Count> { 151 | template 152 | MGPU_DEVICE static void Sort(K* keys, V* values, int flags, 153 | Comp comp) { } 154 | }; 155 | 156 | template 157 | MGPU_DEVICE void OddEvenMergesort(K* keys, V* values, Comp comp) { 158 | const int Width = 1<< sLogPow2::value; 159 | OddEvenMergesortT::Sort(keys, values, 0, comp); 160 | } 161 | template 162 | MGPU_DEVICE void OddEvenMergesortFlags(K* keys, V* values, int flags, 163 | Comp comp) { 164 | const int Width = 1<< sLogPow2::value; 165 | OddEvenMergesortT::Sort(keys, values, flags, comp); 166 | } 167 | 168 | } // namespace mgpu 169 | -------------------------------------------------------------------------------- 
/src/transforms/warp-ctc/include/contrib/moderngpu/include/mgpudevice.cuh: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include "mgpuenums.h" 38 | #include "device/deviceutil.cuh" 39 | 40 | namespace mgpu { 41 | 42 | //////////////////////////////////////////////////////////////////////////////// 43 | // device/loadstore.cuh 44 | 45 | // For 0 <= i < VT: 46 | // index = NT * i + tid; 47 | // reg[i] = data[index]; 48 | // Synchronize after load. 49 | template 50 | MGPU_DEVICE void DeviceSharedToReg(InputIt data, int tid, T* reg, 51 | bool sync = true); 52 | 53 | // For 0 <= i < VT: 54 | // index = NT * i + tid; 55 | // if(index < count) reg[i] = data[index]; 56 | // No synchronize after load. 57 | template 58 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 59 | T* reg, bool sync = false); 60 | 61 | template 62 | MGPU_DEVICE void DeviceGlobalToRegDefault(int count, InputIt data, int tid, 63 | T* reg, T init, bool sync = false); 64 | 65 | // For 0 <= i < VT: 66 | // index = NT * i + tid; 67 | // if(index < count) reg[i] = data[index]; 68 | // No synchronize after load. 69 | template 70 | MGPU_DEVICE void DeviceGlobalToReg(int count, InputIt data, int tid, 71 | T* reg, bool sync = false); 72 | 73 | // For 0 <= i < VT: 74 | // index = NT * i + tid; 75 | // if(index < count) reg[i] = data[index]; 76 | // No synchronize after load. 77 | template 78 | MGPU_DEVICE void DeviceGlobalToRegDefault2(int count, InputIt data, int tid, 79 | T* reg, T init, bool sync = false); 80 | 81 | // For 0 <= i < VT: 82 | // index = NT * i + tid; 83 | // if(index < count) reg[i] = data[index]; 84 | // No synchronize after load. 
85 | // No optimized code path for count < NV (smaller generated code). 86 | template 87 | MGPU_DEVICE void DeviceGlobalToRegLoop(int count, InputIt data, int tid, 88 | T* reg, bool sync = false); 89 | 90 | 91 | // For 0 <= i < VT: 92 | // index = VT * tid + i. 93 | // if(index < count) reg[i] = data[index]; 94 | // No synchronize after load. 95 | template 96 | MGPU_DEVICE void DeviceGlobalToThread(int count, InputIt data, int tid, 97 | T* reg); 98 | 99 | template 100 | MGPU_DEVICE void DeviceGlobalToThreadDefault(int count, InputIt data, int tid, 101 | T* reg, T init); 102 | 103 | // For 0 <= i < VT: 104 | // index = NT * i + tid; 105 | // if(index < count) data[index] = reg[i]; 106 | // Synchronize after load. 107 | template 108 | MGPU_DEVICE void DeviceRegToShared(const T* reg, int tid, OutputIt dest, 109 | bool sync = true); 110 | 111 | // For 0 <= i < VT: 112 | // index = NT * i + tid; 113 | // if(index < count) data[index] = reg[i]; 114 | // No synchronize after load. 115 | template 116 | MGPU_DEVICE void DeviceRegToGlobal(int count, const T* reg, int tid, 117 | OutputIt dest, bool sync = false); 118 | 119 | // For 0 <= index < count: 120 | // dest[index] = source[index]; 121 | // This function is intended to replace DeviceGlobalToShared in cases where 122 | // count is much less than NT * VT. 123 | template 124 | MGPU_DEVICE void DeviceMemToMemLoop(int count, InputIt source, int tid, 125 | OutputIt dest, bool sync = true); 126 | 127 | // For 0 <= index < count: 128 | // dest[index] = source[index]; 129 | // Synchronize after store. 130 | template 131 | MGPU_DEVICE void DeviceSharedToGlobal(int count, const T* source, int tid, 132 | OutputIt dest, bool sync = true); 133 | 134 | // For 0 <= index < count: 135 | // dest[index] = source[index]; 136 | // Synchronize after store. 
137 | template 138 | MGPU_DEVICE void DeviceGlobalToShared(int count, InputIt source, int tid, 139 | T* dest, bool sync = true); 140 | 141 | template 142 | MGPU_DEVICE void DeviceGlobalToShared2(int count, InputIt source, int tid, 143 | T* dest, bool sync = true); 144 | 145 | // For 0 <= index < count: 146 | // dest[index] = source[index]; 147 | // Synchronize after store. 148 | // No optimized code path for count < NV (smaller generated code). 149 | template 150 | MGPU_DEVICE void DeviceGlobalToSharedLoop(int count, InputIt source, int tid, 151 | T* dest, bool sync = true); 152 | 153 | template 154 | MGPU_DEVICE void DeviceGlobalToSharedDefault(int count, InputIt source, int tid, 155 | T* dest, T init, bool sync = true); 156 | 157 | template 158 | MGPU_DEVICE void DeviceGlobalToSharedDefault2(int count, InputIt source, 159 | int tid, T* dest, T init, bool sync = true); 160 | 161 | // For 0 <= index < count: 162 | // dest[index] = source[index]; 163 | // No synchronize. 164 | template 165 | MGPU_DEVICE void DeviceGlobalToGlobal(int count, InputIt source, int tid, 166 | OutputIt dest, bool sync = false); 167 | 168 | // Transponse VT elements in NT threads (x) into thread-order registers (y) 169 | // using only NT * VT / 2 elements of shared memory. 170 | template 171 | MGPU_DEVICE void HalfSmemTranspose(const T* x, int tid, T* shared, T* y); 172 | 173 | // For 0 <= i < VT: 174 | // index = NT * i + tid; 175 | // if(index < count) 176 | // gather = indices[index]; 177 | // reg[i] = data[gather]; 178 | // Synchronize after load. 
179 | template 180 | MGPU_DEVICE void DeviceGather(int count, InputIt data, int indices[VT], 181 | int tid, T* reg, bool sync = true); 182 | 183 | template 184 | MGPU_DEVICE void DeviceGatherDefault(int count, InputIt data, int indices[VT], 185 | int tid, T* reg, T identity, bool sync = true); 186 | 187 | // For 0 <= i < VT: 188 | // index = NT * i + tid; 189 | // if(index < count) 190 | // scatter = indices[index]; 191 | // data[scatter] = reg[i]; 192 | // Synchronize after store. 193 | template 194 | MGPU_DEVICE void DeviceScatter(int count, const T* reg, int tid, 195 | int indices[VT], OutputIt data, bool sync = true); 196 | 197 | // For 0 <= i < VT: 198 | // shared[VT * tid + i] = threadReg[i]; 199 | // Synchronize after store. 200 | // Note this function moves data in THREAD ORDER. 201 | // (DeviceRegToShared moves data in STRIDED ORDER). 202 | template 203 | MGPU_DEVICE void DeviceThreadToShared(const T* threadReg, int tid, T* shared, 204 | bool sync = true); 205 | 206 | // For 0 <= i < VT: 207 | // threadReg[i] = shared[VT * tid + i]; 208 | // Synchronize after load. 209 | // Note this function moves data in THREAD ORDER. 210 | // (DeviceSharedToReg moves data in STRIDED ORDER). 211 | template 212 | MGPU_DEVICE void DeviceSharedToThread(const T* shared, int tid, T* threadReg, 213 | bool sync = true); 214 | 215 | // For 0 <= index < aCount: 216 | // shared[index] = a_global[index]; 217 | // For 0 <= index < bCount: 218 | // shared[aCount + index] = b_global[index]; 219 | // VT0 is the lower-bound for predication-free execution: 220 | // If count >= NT * VT0, a predication-free branch is taken. 221 | // VT1 is the upper-bound for loads: 222 | // NT * VT1 must >= aCount + bCount. 
223 | 224 | template 225 | MGPU_DEVICE void DeviceLoad2ToReg(const T* a_global, int aCount, 226 | const T* b_global, int bCount, int tid, T* reg, bool sync = false); 227 | 228 | template 229 | MGPU_DEVICE void DeviceLoad2ToShared(const T* a_global, int aCount, 230 | const T* b_global, int bCount, int tid, T* shared, bool sync = true); 231 | 232 | template 234 | MGPU_DEVICE void DeviceLoad2ToReg(InputIt1 a_global, int aCount, 235 | InputIt2 b_global, int bCount, int tid, T* reg, bool sync = false); 236 | 237 | template 239 | MGPU_DEVICE void DeviceLoad2ToShared(InputIt1 a_global, int aCount, 240 | InputIt2 b_global, int bCount, int tid, T* shared, bool sync = true); 241 | 242 | // For 0 <= i < VT 243 | // index = NT * i + tid; 244 | // if(index < count) 245 | // gather = indices_shared[index]; 246 | // dest_global[index] = data_global[gather]; 247 | // Synchronize after load. 248 | template 249 | MGPU_DEVICE void DeviceGatherGlobalToGlobal(int count, InputIt data_global, 250 | const int* indices_shared, int tid, OutputIt dest_global, 251 | bool sync = true); 252 | 253 | // For 0 <= i < VT 254 | // index = NT * i + tid 255 | // if(index < count) 256 | // gather = indices[index]; 257 | // if(gather < aCount) data = a_global[gather]; 258 | // else data = b_global[gather - aCount]; 259 | // dest_global[index] = data; 260 | // Synchronize after load. 
261 | template 263 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, InputIt1 a_global, 264 | InputIt2 b_global, int bStart, const int* indices, int tid, 265 | T* reg, bool sync = false); 266 | 267 | template 269 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, InputIt1 a_global, 270 | InputIt2 b_global, int bStart, const int* indices_shared, int tid, 271 | OutputIt dest_global, bool sync = true); 272 | 273 | template 274 | MGPU_DEVICE void DeviceTransferMergeValuesReg(int count, const T* a_global, 275 | const T* b_global, int bStart, const int* indices, int tid, 276 | T* reg, bool sync = false); 277 | 278 | template 279 | MGPU_DEVICE void DeviceTransferMergeValuesShared(int count, const T* a_global, 280 | const T* b_global, int bStart, const int* indices_shared, int tid, 281 | OutputIt dest_global, bool sync = true); 282 | 283 | 284 | 285 | } // namespace mgpu 286 | 287 | 288 | #include "device/loadstore.cuh" 289 | #include "device/ctasegscan.cuh" 290 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/mgpuenums.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 
32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | namespace mgpu { 38 | 39 | enum MgpuBounds { 40 | MgpuBoundsLower, 41 | MgpuBoundsUpper 42 | }; 43 | 44 | enum MgpuScanType { 45 | MgpuScanTypeExc, 46 | MgpuScanTypeInc 47 | }; 48 | 49 | enum MgpuSearchType { 50 | MgpuSearchTypeNone, 51 | MgpuSearchTypeIndex, 52 | MgpuSearchTypeMatch, 53 | MgpuSearchTypeIndexMatch 54 | }; 55 | 56 | enum MgpuJoinKind { 57 | MgpuJoinKindInner, 58 | MgpuJoinKindLeft, 59 | MgpuJoinKindRight, 60 | MgpuJoinKindOuter 61 | }; 62 | 63 | enum MgpuSetOp { 64 | MgpuSetOpIntersection, 65 | MgpuSetOpUnion, 66 | MgpuSetOpDiff, 67 | MgpuSetOpSymDiff 68 | }; 69 | 70 | } // namespace mgpu 71 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/contrib/moderngpu/include/util/static.h: -------------------------------------------------------------------------------- 1 | /****************************************************************************** 2 | * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of the NVIDIA CORPORATION nor the 12 | * names of its contributors may be used to endorse or promote products 13 | * derived from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY 19 | * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | ******************************************************************************/ 27 | 28 | /****************************************************************************** 29 | * 30 | * Code and text by Sean Baxter, NVIDIA Research 31 | * See http://nvlabs.github.io/moderngpu for repository and documentation. 32 | * 33 | ******************************************************************************/ 34 | 35 | #pragma once 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | 51 | #ifndef MGPU_MIN 52 | #define MGPU_MIN(x, y) (((x) <= (y)) ? (x) : (y)) 53 | #define MGPU_MAX(x, y) (((x) >= (y)) ? (x) : (y)) 54 | #define MGPU_MAX0(x) (((x) >= 0) ? (x) : 0) 55 | #define MGPU_ABS(x) (((x) >= 0) ? 
(x) : (-(x))) 56 | 57 | #define MGPU_DIV_UP(x, y) (((x) + (y) - 1) / (y)) 58 | #define MGPU_DIV_ROUND(x, y) (((x) + (y) / 2) / (y)) 59 | #define MGPU_ROUND_UP(x, y) ((y) * MGPU_DIV_UP(x, y)) 60 | #define MGPU_SHIFT_DIV_UP(x, y) (((x) + ((1<< (y)) - 1))>> (y)) 61 | #define MGPU_ROUND_UP_POW2(x, y) (((x) + (y) - 1) & ~((y) - 1)) 62 | #define MGPU_ROUND_DOWN_POW2(x, y) ((x) & ~((y) - 1)) 63 | #define MGPU_IS_POW_2(x) (0 == ((x) & ((x) - 1))) 64 | 65 | #endif // MGPU_MIN 66 | 67 | namespace mgpu { 68 | 69 | 70 | typedef unsigned char byte; 71 | 72 | typedef unsigned int uint; 73 | typedef signed short int16; 74 | 75 | typedef unsigned short ushort; 76 | typedef unsigned short uint16; 77 | 78 | typedef long long int64; 79 | typedef unsigned long long uint64; 80 | 81 | // IsPow2::value is true if X is a power of 2. 82 | template struct sIsPow2 { 83 | enum { value = 0 == (X & (X - 1)) }; 84 | }; 85 | 86 | // Finds the base-2 logarithm of X. value is -1 if X is not a power of 2. 87 | template struct sLogPow2 { 88 | enum { extra = sIsPow2::value ? 0 : (roundUp ? 1 : 0) }; 89 | enum { inner = sLogPow2::inner + 1 }; 90 | enum { value = inner + extra }; 91 | }; 92 | template struct sLogPow2<0, roundUp> { 93 | enum { inner = 0 }; 94 | enum { value = 0 }; 95 | }; 96 | template struct sLogPow2<1, roundUp> { 97 | enum { inner = 0 }; 98 | enum { value = 0 }; 99 | }; 100 | 101 | template 102 | struct sDivUp { 103 | enum { value = (X + Y - 1) / Y }; 104 | }; 105 | 106 | template struct sDiv2RoundUp { 107 | enum { value = sDiv2RoundUp::value, levels - 1>::value }; 108 | }; 109 | template struct sDiv2RoundUp { 110 | enum { value = count }; 111 | }; 112 | 113 | template 114 | struct sDivSafe { 115 | enum { value = X / Y }; 116 | }; 117 | template 118 | struct sDivSafe { 119 | enum { value = 0 }; 120 | }; 121 | 122 | template 123 | struct sRoundUp { 124 | enum { rem = X % Y }; 125 | enum { value = X + (rem ?
(Y - rem) : 0) }; 126 | }; 127 | 128 | template 129 | struct sRoundDown { 130 | enum { rem = X % Y }; 131 | enum { value = X - rem }; 132 | }; 133 | 134 | // IntegerDiv is a template for avoiding divisions by zero in template 135 | // evaluation. Templates always evaluate both b and c in an expression like 136 | // a ? b : c, and will error if either rhs contains an illegal expression, 137 | // even if the ternary is explicitly designed to guard against that. 138 | template 139 | struct sIntegerDiv { 140 | enum { value = X / (Y ? Y : (X + 1)) }; 141 | }; 142 | 143 | template 144 | struct sMax { 145 | enum { value = (X >= Y) ? X : Y }; 146 | }; 147 | template 148 | struct sMin { 149 | enum { value = (X <= Y) ? X : Y }; 150 | }; 151 | 152 | template 153 | struct sAbs { 154 | enum { value = (X >= 0) ? X : -X }; 155 | }; 156 | 157 | 158 | // Finds the number of powers of 2 in the prime factorization of X. 159 | template struct sNumFactorsOf2 { 160 | enum { shifted = X >> 1 }; 161 | enum { value = 1 + sNumFactorsOf2::value }; 162 | }; 163 | template struct sNumFactorsOf2 { 164 | enum { value = 0 }; 165 | }; 166 | 167 | // Returns the divisor for a conflict-free transpose. 168 | template struct sBankConflictDivisor { 169 | enum { value = 170 | (1 & X) ? 0 : 171 | (sIsPow2::value ? NumBanks : 172 | (1<< sNumFactorsOf2::value)) }; 173 | enum { log_value = sLogPow2::value }; 174 | }; 175 | 176 | template struct sConflictFreeStorage { 177 | enum { count = NT * X }; 178 | enum { divisor = sBankConflictDivisor::value }; 179 | enum { padding = sDivSafe::value }; 180 | enum { value = count + padding }; 181 | }; 182 | 183 | } // namespace mgpu 184 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/ctc.h: -------------------------------------------------------------------------------- 1 | /** \file ctc.h 2 | * Contains a simple C interface to call fast CPU and GPU based computation 3 | * of the CTC loss.
4 | */ 5 | 6 | #pragma once 7 | 8 | #ifdef __cplusplus 9 | #include 10 | extern "C" { 11 | #endif 12 | 13 | //forward declare of CUDA typedef to avoid needing to pull in CUDA headers 14 | typedef struct CUstream_st* CUstream; 15 | 16 | typedef enum { 17 | CTC_STATUS_SUCCESS = 0, 18 | CTC_STATUS_MEMOPS_FAILED = 1, 19 | CTC_STATUS_INVALID_VALUE = 2, 20 | CTC_STATUS_EXECUTION_FAILED = 3, 21 | CTC_STATUS_UNKNOWN_ERROR = 4 22 | } ctcStatus_t; 23 | 24 | /** Returns a string containing a description of status that was passed in 25 | * \param[in] status identifies which string should be returned 26 | * \return C style string containing the text description 27 | * */ 28 | const char* ctcGetStatusString(ctcStatus_t status); 29 | 30 | typedef enum { 31 | CTC_CPU = 0, 32 | CTC_GPU = 1 33 | } ctcComputeLocation; 34 | 35 | /** Structure used to indicate where the ctc calculation should take place 36 | * and parameters associated with that place. 37 | * Cpu execution can specify the maximum number of threads that can be used 38 | * Gpu execution can specify which stream the kernels should be launched in. 39 | * */ 40 | struct ctcComputeInfo { 41 | ctcComputeLocation loc; 42 | union { 43 | unsigned int num_threads; 44 | CUstream stream; 45 | }; 46 | }; 47 | 48 | /** Compute the connectionist temporal classification loss between a sequence 49 | * of probabilities and a ground truth labeling. Optionally compute the 50 | * gradient with respect to the inputs. 51 | * \param [in] activations pointer to the activations in either CPU or GPU 52 | * addressable memory, depending on info. We assume a fixed 53 | * memory layout for this 3 dimensional tensor, which has dimension 54 | * (t, n, p), where t is the time index, n is the minibatch index, 55 | * and p indexes over probabilities of each symbol in the alphabet. 
56 | * The memory layout is (t, n, p) in C order (slowest to fastest changing 57 | * index, aka row-major), or (p, n, t) in Fortran order (fastest to slowest 58 | * changing index, aka column-major). We also assume strides are equal to 59 | * dimensions - there is no padding between dimensions. 60 | * More precisely, element (t, n, p), for a problem with mini_batch examples 61 | * in the mini batch, and alphabet_size symbols in the alphabet, is located at: 62 | * activations[(t * mini_batch + n) * alphabet_size + p] 63 | * \param [out] gradients if not NULL, then gradients are computed. Should be 64 | * allocated in the same memory space as probs and memory 65 | * ordering is identical. 66 | * \param [in] flat_labels Always in CPU memory. A concatenation 67 | * of all the labels for the minibatch. 68 | * \param [in] label_lengths Always in CPU memory. The length of each label 69 | * for each example in the minibatch. 70 | * \param [in] input_lengths Always in CPU memory. The number of time steps 71 | * for each sequence in the minibatch. 72 | * \param [in] alphabet_size The number of possible output symbols. There 73 | * should be this many probabilities for each time step. 74 | * \param [in] mini_batch How many examples in a minibatch. 75 | * \param [out] costs Always in CPU memory. The cost of each example in the 76 | * minibatch. 77 | * \param [in,out] workspace In same memory space as probs. Should be of 78 | * size requested by get_workspace_size. 79 | * \param [in] ctcComputeInfo describes whether or not the execution should 80 | * take place on the CPU or GPU, and by extension the location of 81 | * the probs and grads pointers. Can be used to set the 82 | * number of threads for cpu execution or the stream for gpu 83 | * execution. 
84 | * 85 | * \return Status information 86 | * 87 | * */ 88 | ctcStatus_t compute_ctc_loss(const float* const activations, 89 | float* gradients, 90 | const int* const flat_labels, 91 | const int* const label_lengths, 92 | const int* const input_lengths, 93 | int alphabet_size, 94 | int minibatch, 95 | float *costs, 96 | void *workspace, 97 | ctcComputeInfo info); 98 | 99 | // Simple wrappers to enable neon support 100 | int compute_ctc_loss_cpu(const float* const activations, 101 | float* gradients, 102 | const int* const flat_labels, 103 | const int* const label_lengths, 104 | const int* const input_lengths, 105 | int alphabet_size, 106 | int minibatch, 107 | float *costs, 108 | int num_threads); 109 | 110 | #ifdef __CUDACC__ 111 | int get_workspace_size_gpu(const int* const label_lengths, 112 | const int* const input_lengths, 113 | int alphabet_size, int minibatch, 114 | CUstream stream); 115 | 116 | int compute_ctc_loss_gpu(const float* const activations, 117 | float* gradients, 118 | const int* const flat_labels, 119 | const int* const label_lengths, 120 | const int* const input_lengths, 121 | int alphabet_size, 122 | int minibatch, 123 | float *costs, 124 | void *workspace, 125 | CUstream stream); 126 | #endif 127 | 128 | 129 | /** For a given set of labels and minibatch size return the required workspace 130 | * size. This will need to be allocated in the same memory space as your 131 | * probabilities. 132 | * \param [in] label_lengths Always in CPU memory. The length of each label 133 | * for each example in the minibatch. 134 | * \param [in] input_lengths Always in CPU memory. The number of time steps 135 | * for each sequence in the minibatch. 136 | * \param [in] alphabet_size How many symbols in the alphabet or, equivalently, 137 | * the number of probabilities at each time step 138 | * \param [in] mini_batch How many examples in a minibatch. 
139 | * \param [in] info struct describing the location (cpu/gpu) and associated 140 | * parameters of execution 141 | * \param [out] size_bytes is pointer to a scalar where the memory 142 | * requirement in bytes will be placed. This memory should be allocated 143 | * at the same place, CPU or GPU, that the probs are in 144 | * 145 | * \return Status information 146 | **/ 147 | ctcStatus_t get_workspace_size(const int* const label_lengths, 148 | const int* const input_lengths, 149 | int alphabet_size, int minibatch, 150 | ctcComputeInfo info, 151 | size_t* size_bytes); 152 | 153 | 154 | #ifdef __cplusplus 155 | } 156 | #endif 157 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/ctc_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "hostdevice.h" 8 | 9 | namespace ctc_helper { 10 | 11 | static const int BLANK = 0; 12 | static const float threshold = 1e-1; 13 | 14 | template 15 | HOSTDEVICE 16 | T neg_inf() { return -T(INFINITY); } 17 | 18 | inline int div_up(int x, int y) { 19 | return (x + y - 1) / y; 20 | } 21 | 22 | template struct maximum { 23 | HOSTDEVICE 24 | Res operator()(const Arg& x, const Arg& y) const { 25 | return x < y ? 
y : x; 26 | } 27 | }; 28 | 29 | template struct add { 30 | HOSTDEVICE 31 | Res operator()(const Arg& x, const Arg& y) const { 32 | return x + y; 33 | } 34 | }; 35 | 36 | template struct identity { 37 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(x);} 38 | }; 39 | 40 | template struct negate { 41 | HOSTDEVICE Res operator()(const Arg& x) const {return Res(-x);} 42 | }; 43 | 44 | template struct exponential { 45 | HOSTDEVICE Res operator()(const Arg& x) const {return std::exp(x);} 46 | }; 47 | 48 | template 49 | struct log_plus { 50 | typedef Res result_type; 51 | HOSTDEVICE 52 | Res operator()(const Arg1& p1, const Arg2& p2) { 53 | if (p1 == neg_inf()) 54 | return p2; 55 | if (p2 == neg_inf()) 56 | return p1; 57 | Res result = log1p(exp(-fabs(p1 - p2))) + maximum()(p1, p2); 58 | return result; 59 | } 60 | }; 61 | 62 | } 63 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/hostdevice.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __CUDACC__ 4 | #define HOSTDEVICE __host__ __device__ 5 | #else 6 | #define HOSTDEVICE 7 | #endif 8 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/include/detail/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | ctcStatus_t reduce_negate(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 4 | ctcStatus_t reduce_exp(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 5 | ctcStatus_t reduce_max(const float* input, float* output, int rows, int cols, bool axis, cudaStream_t stream); 6 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/__init__.py: -------------------------------------------------------------------------------- 1 | 
from ctc import cpu_ctc_np, cpu_ctc_th 2 | del ctc 3 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/ctc.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import os 3 | import numpy as np 4 | import numpy.ctypeslib as npct 5 | import ctypes 6 | import ctypes.util 7 | 8 | import theano 9 | import theano.tensor as T 10 | from theano.gradient import grad_undefined 11 | 12 | if platform.system() == "Darwin": 13 | ext = "dylib" 14 | elif platform.system() == "Linux": 15 | ext = "so" 16 | else: 17 | raise Exception("Unsupported platform: {}".format(platform.system())) 18 | libwarpctc = npct.load_library(os.path.join(os.path.dirname(__file__), "../build/libwarpctc.{}".format(ext)), "") 19 | 20 | libwarpctc.cpu_ctc.restype = None 21 | libwarpctc.cpu_ctc.argtypes = [ 22 | npct.ndpointer(dtype=np.float32, ndim=3), 23 | npct.ndpointer(dtype=np.float32, ndim=3), 24 | npct.ndpointer(dtype=np.int32, ndim=1), 25 | npct.ndpointer(dtype=np.int32, ndim=1), 26 | npct.ndpointer(dtype=np.int32, ndim=1), 27 | ctypes.c_int, 28 | ctypes.c_int, 29 | npct.ndpointer(dtype=np.float32, ndim=1), 30 | ctypes.c_int] 31 | 32 | def cpu_ctc_np(acts, act_lens, labels, label_lens): 33 | """ 34 | acts: 3-d numpy float array, same as c++ bindings 35 | act_lens: 1-d int array of input length of each example 36 | labels: list of 1-d int array for each example in minibatch 37 | label_lens: 1-d int array of label length of each example 38 | """ 39 | # make sure correct types 40 | acts = np.array(acts, dtype=np.float32) 41 | act_lens = np.array(act_lens, dtype=np.int32) 42 | labels = np.array(labels, dtype=np.int32) 43 | label_lens = np.array(label_lens, dtype=np.int32) 44 | 45 | # C needs sizes 46 | alphabet_size = acts.shape[2] 47 | minibatch = acts.shape[1] 48 | 49 | # create return variables 50 | grads = np.zeros_like(acts, dtype=np.float32) 51 | cost = np.zeros((minibatch,), 
dtype=np.float32) 52 | 53 | # compute 54 | libwarpctc.cpu_ctc(acts, grads, labels, label_lens, act_lens, alphabet_size, minibatch, cost, 1) 55 | return cost, grads 56 | 57 | class CPUCTCGrad(theano.Op): 58 | # Properties attribute 59 | __props__ = () 60 | 61 | def make_node(self, *inputs): 62 | inputs = map(theano.tensor.as_tensor_variable, inputs) 63 | # add checks here for types and numdims of all inputs 64 | return theano.Apply(self, inputs, [T.ftensor3()]) 65 | 66 | def perform(self, node, inputs, outputs): 67 | inputs[0] = inputs[0].astype(np.float32) 68 | inputs[1] = inputs[1].astype(np.int32) 69 | inputs[2] = inputs[2].astype(np.int32) 70 | inputs[3] = inputs[3].astype(np.int32) 71 | cost, gradients = cpu_ctc_np(*inputs) 72 | outputs[0][0] = gradients 73 | 74 | class CPUCTC(theano.Op): 75 | # Properties attribute 76 | __props__ = () 77 | 78 | def make_node(self, *inputs): 79 | inputs = map(theano.tensor.as_tensor_variable, inputs) 80 | # add checks here for types and numdims of all inputs 81 | return theano.Apply(self, inputs, [T.fvector()]) 82 | 83 | def perform(self, node, inputs, outputs): 84 | inputs[0] = inputs[0].astype(np.float32) 85 | inputs[1] = inputs[1].astype(np.int32) 86 | inputs[2] = inputs[2].astype(np.int32) 87 | inputs[3] = inputs[3].astype(np.int32) 88 | cost, gradients = cpu_ctc_np(*inputs) 89 | outputs[0][0] = cost 90 | 91 | def grad(self, inputs, output_grads): 92 | gradients = CPUCTCGrad()(*inputs) 93 | return [gradients, 94 | grad_undefined(self, 1, inputs[1]), 95 | grad_undefined(self, 2, inputs[2]), 96 | grad_undefined(self, 3, inputs[3])] 97 | 98 | cpu_ctc_th = CPUCTC() 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/python/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | setup(name='ctc', 3 | version='0.1', 4 | packages=['ctc'], 5 | package_dir={'ctc': '.'}, 6 | 
package_data={'ctc' : ['../build/libwarpctc.*']} 7 | ) 8 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/ctc_entrypoint.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "detail/cpu_ctc.h" 8 | #ifdef __CUDACC__ 9 | #include "detail/gpu_ctc.h" 10 | #endif 11 | 12 | 13 | extern "C" { 14 | 15 | const char* ctcGetStatusString(ctcStatus_t status) { 16 | switch (status) { 17 | case CTC_STATUS_SUCCESS: 18 | return "no error"; 19 | case CTC_STATUS_MEMOPS_FAILED: 20 | return "cuda memcpy or memset failed"; 21 | case CTC_STATUS_INVALID_VALUE: 22 | return "invalid value"; 23 | case CTC_STATUS_EXECUTION_FAILED: 24 | return "execution failed"; 25 | 26 | case CTC_STATUS_UNKNOWN_ERROR: 27 | default: 28 | return "unknown error"; 29 | 30 | } 31 | 32 | } 33 | 34 | inline void throw_on_error(ctcStatus_t status, const char* message) { 35 | if (status != CTC_STATUS_SUCCESS) { 36 | throw std::runtime_error(message + (", stat = " + 37 | std::string(ctcGetStatusString(status)))); 38 | } 39 | } 40 | 41 | 42 | ctcStatus_t compute_ctc_loss(const float* const activations, 43 | float* gradients, 44 | const int* const flat_labels, 45 | const int* const label_lengths, 46 | const int* const input_lengths, 47 | int alphabet_size, 48 | int minibatch, 49 | float *costs, 50 | void *workspace, 51 | ctcComputeInfo info) { 52 | 53 | if (activations == nullptr || 54 | flat_labels == nullptr || 55 | label_lengths == nullptr || 56 | input_lengths == nullptr || 57 | costs == nullptr || 58 | workspace == nullptr || 59 | alphabet_size <= 0 || 60 | minibatch <= 0) 61 | return CTC_STATUS_INVALID_VALUE; 62 | 63 | if (info.loc == CTC_CPU) { 64 | CpuCTC ctc(alphabet_size, minibatch, workspace, info.num_threads); 65 | 66 | if (gradients != NULL) 67 | return ctc.cost_and_grad(activations, gradients, 68 | costs, 69 | flat_labels, label_lengths, 70 
| input_lengths); 71 | else 72 | return ctc.score_forward(activations, costs, flat_labels, 73 | label_lengths, input_lengths); 74 | } else if (info.loc == CTC_GPU) { 75 | #ifdef __CUDACC__ 76 | GpuCTC ctc(alphabet_size, minibatch, workspace, info.stream); 77 | 78 | if (gradients != NULL) 79 | return ctc.cost_and_grad(activations, gradients, costs, 80 | flat_labels, label_lengths, 81 | input_lengths); 82 | else 83 | return ctc.score_forward(activations, costs, flat_labels, 84 | label_lengths, input_lengths); 85 | #else 86 | std::cerr << "GPU execution requested, but not compiled with GPU support" << std::endl; 87 | return CTC_STATUS_EXECUTION_FAILED; 88 | #endif 89 | } else { 90 | return CTC_STATUS_INVALID_VALUE; 91 | } 92 | } 93 | 94 | 95 | ctcStatus_t get_workspace_size(const int* const label_lengths, 96 | const int* const input_lengths, 97 | int alphabet_size, int minibatch, 98 | ctcComputeInfo info, 99 | size_t* size_bytes) 100 | { 101 | if (label_lengths == nullptr || 102 | input_lengths == nullptr || 103 | size_bytes == nullptr || 104 | alphabet_size <= 0 || 105 | minibatch <= 0) 106 | return CTC_STATUS_INVALID_VALUE; 107 | 108 | // This is the max of all S and T for all examples in the minibatch. 
109 | int maxL = *std::max_element(label_lengths, label_lengths + minibatch); 110 | int maxT = *std::max_element(input_lengths, input_lengths + minibatch); 111 | 112 | const int S = 2 * maxL + 1; 113 | 114 | *size_bytes = 0; 115 | 116 | if (info.loc == CTC_GPU) { 117 | // GPU storage 118 | //nll_forward, nll_backward 119 | *size_bytes += 2 * sizeof(float) * minibatch; 120 | 121 | //repeats 122 | *size_bytes += sizeof(int) * minibatch; 123 | 124 | //label offsets 125 | *size_bytes += sizeof(int) * minibatch; 126 | 127 | //utt_length 128 | *size_bytes += sizeof(int) * minibatch; 129 | 130 | //label lengths 131 | *size_bytes += sizeof(int) * minibatch; 132 | 133 | //labels without blanks - overallocate for now 134 | *size_bytes += sizeof(int) * maxL * minibatch; 135 | 136 | //labels with blanks 137 | *size_bytes += sizeof(int) * S * minibatch; 138 | 139 | //alphas 140 | *size_bytes += sizeof(float) * S * maxT * minibatch; 141 | 142 | //denoms 143 | *size_bytes += sizeof(float) * maxT * minibatch; 144 | 145 | //probs (since we will pass in activations) 146 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 147 | 148 | } else { 149 | //cpu can eventually replace all minibatch with 150 | //max number of concurrent threads if memory is 151 | //really tight 152 | 153 | //per minibatch memory 154 | size_t per_minibatch_bytes = 0; 155 | 156 | //output 157 | per_minibatch_bytes += sizeof(float) * alphabet_size ; 158 | 159 | //alphas 160 | per_minibatch_bytes += sizeof(float) * S * maxT; 161 | 162 | //betas 163 | per_minibatch_bytes += sizeof(float) * S; 164 | 165 | //labels w/blanks, e_inc, s_inc 166 | per_minibatch_bytes += 3 * sizeof(int) * S; 167 | 168 | *size_bytes = per_minibatch_bytes * minibatch; 169 | 170 | //probs 171 | *size_bytes += sizeof(float) * alphabet_size * maxT * minibatch; 172 | } 173 | 174 | return CTC_STATUS_SUCCESS; 175 | } 176 | 177 | /* 178 | Simple wrappers for neon compatibility 179 | */ 180 | #ifdef __CUDACC__ 181 | int 
get_workspace_size_gpu(const int* const label_lengths, 182 | const int* const input_lengths, 183 | int alphabet_size, int minibatch, 184 | cudaStream_t stream) { 185 | ctcComputeInfo info; 186 | info.loc = CTC_GPU; 187 | info.stream = stream; 188 | 189 | size_t size_bytes; 190 | get_workspace_size(label_lengths, input_lengths, alphabet_size, 191 | minibatch, info, &size_bytes); 192 | 193 | return int(size_bytes); 194 | } 195 | 196 | 197 | int compute_ctc_loss_gpu(const float* const activations, 198 | float* gradients, 199 | const int* const flat_labels, 200 | const int* const label_lengths, 201 | const int* const input_lengths, 202 | int alphabet_size, 203 | int minibatch, 204 | float *costs, 205 | void *workspace, 206 | cudaStream_t stream) { 207 | 208 | ctcComputeInfo info; 209 | info.loc = CTC_GPU; 210 | info.stream = stream; 211 | 212 | ctcStatus_t status = compute_ctc_loss(activations, 213 | gradients, 214 | flat_labels, 215 | label_lengths, 216 | input_lengths, 217 | alphabet_size, 218 | minibatch, 219 | costs, 220 | workspace, 221 | info); 222 | 223 | // Maybe call throw_on_error here? 
224 | return int(status); 225 | 226 | } 227 | #endif 228 | 229 | int compute_ctc_loss_cpu(const float* const activations, 230 | float* gradients, 231 | const int* const flat_labels, 232 | const int* const label_lengths, 233 | const int* const input_lengths, 234 | int alphabet_size, 235 | int minibatch, 236 | float *costs, 237 | int num_threads) { 238 | ctcComputeInfo info; 239 | info.loc = CTC_CPU; 240 | info.num_threads = num_threads; 241 | 242 | size_t size_bytes; 243 | get_workspace_size(label_lengths, 244 | input_lengths, 245 | alphabet_size, 246 | minibatch, 247 | info, 248 | &size_bytes); 249 | 250 | void* workspace = malloc(size_bytes); 251 | 252 | ctcStatus_t status = compute_ctc_loss(activations, 253 | gradients, 254 | flat_labels, 255 | label_lengths, 256 | input_lengths, 257 | alphabet_size, 258 | minibatch, 259 | costs, 260 | workspace, 261 | info); 262 | free(workspace); 263 | 264 | // Maybe call throw_on_error here? 265 | return int(status); 266 | } 267 | } 268 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/ctc_entrypoint.cu: -------------------------------------------------------------------------------- 1 | ctc_entrypoint.cpp -------------------------------------------------------------------------------- /src/transforms/warp-ctc/src/reduce.cu: -------------------------------------------------------------------------------- 1 | // Includes, system 2 | // #include 3 | // #include 4 | 5 | // Includes, cuda 6 | // #include 7 | // #include 8 | 9 | // Includes, cuda helper functions 10 | // #include 11 | 12 | // For the functors 13 | #include "detail/ctc_helper.h" 14 | #include "ctc.h" 15 | 16 | const int warp_size = 32; 17 | 18 | template 19 | struct CTAReduce; 20 | 21 | template 22 | struct CTAReduce { 23 | enum { Size = NT, Capacity = NT }; 24 | struct Storage { T shared[Capacity]; }; 25 | 26 | __device__ static T reduce(int tid, T x, Storage& storage, int count, Rop g) { 27 | T* s = 
storage.shared; 28 | s[tid] = x; 29 | __syncthreads(); 30 | 31 | // Fold the data in half with each pass. 32 | #pragma unroll 33 | for(int offset = NT / 2; offset >= warp_size; offset /= 2) { 34 | if(tid + offset < count && tid < offset) { 35 | // Read from the right half and store to the left half. 36 | x = g(x, s[offset + tid]); 37 | s[tid] = x; 38 | } 39 | __syncthreads(); 40 | } 41 | 42 | T shuff; 43 | for (int offset = warp_size / 2; offset > 0; offset /= 2) { 44 | shuff = __shfl_down(x, offset); 45 | if (tid + offset < count && tid < offset) 46 | x = g(x, shuff); 47 | } 48 | return x; 49 | } 50 | }; 51 | 52 | template 53 | __global__ void reduce_rows(Iop f, Rop g, const T* input, T* output, 54 | int num_rows, int num_cols) { 55 | 56 | typedef CTAReduce R; 57 | __shared__ typename R::Storage storage; 58 | 59 | int tid = threadIdx.x; 60 | int idx = tid; 61 | int col = blockIdx.x; 62 | T curr; 63 | 64 | // Each block works on a column 65 | if (idx < num_rows) 66 | curr = f(input[idx + col*num_rows]); 67 | idx += NT; 68 | 69 | 70 | while (idx < num_rows) { 71 | curr += f(input[idx + col*num_rows]); 72 | idx += NT; 73 | } 74 | 75 | // Sum thread-totals over the CTA. 
76 | curr = R::reduce(tid, curr, storage, num_rows, g); 77 | 78 | // Store result in out 79 | if (tid == 0) 80 | output[col] = curr; 81 | } 82 | 83 | template 84 | __global__ void reduce_cols(Iop f, Rop g, const T* input, T* output, 85 | int num_rows, int num_cols) { 86 | 87 | __shared__ T s[NT]; 88 | 89 | int warps_per_block = NT / warp_size; 90 | int row = blockDim.x * blockIdx.x + threadIdx.x; 91 | int col = threadIdx.y; 92 | T curr; 93 | 94 | if (row < num_rows && col < num_cols) { 95 | curr = f(input[row + col*num_rows]); 96 | col += blockDim.y; 97 | while (col < num_cols) { 98 | curr = g(curr, f(input[row + col*num_rows])); 99 | col += blockDim.y; 100 | } 101 | } 102 | s[threadIdx.x * warps_per_block + threadIdx.y] = curr; 103 | __syncthreads(); 104 | 105 | // Reduce 106 | if (threadIdx.y == 0 && row < num_rows) { 107 | #pragma unroll 108 | for (int i = 1; i < warps_per_block && i < num_cols; ++i) 109 | curr = g(curr, s[i + threadIdx.x * warps_per_block]); 110 | output[row] = curr; 111 | } 112 | } 113 | 114 | struct ReduceHelper { 115 | 116 | template 117 | static void impl(Iof f, Rof g, const T* input, T* output, int num_rows, int num_cols, bool axis, cudaStream_t stream) { 118 | 119 | int grid_size; 120 | 121 | if (axis) { 122 | grid_size = num_cols; 123 | reduce_rows<128><<>> 124 | (f, g, input, output, num_rows, num_cols); 125 | 126 | } else { 127 | dim3 tpb(warp_size, 128 / warp_size); 128 | grid_size = (num_cols + warp_size - 1)/warp_size; 129 | reduce_cols<128><<>> 130 | (f, g, input, output, num_rows, num_cols); 131 | 132 | } 133 | } 134 | }; 135 | 136 | 137 | template 138 | ctcStatus_t reduce(Iof f, Rof g, const T* input, T* output, int rows, int cols, bool axis, cudaStream_t stream) { 139 | ReduceHelper::impl(f, g, input, output, rows, cols, axis, stream); 140 | cudaStreamSynchronize(stream); 141 | cudaError_t err = cudaGetLastError(); 142 | if (err != cudaSuccess) 143 | return CTC_STATUS_EXECUTION_FAILED; 144 | 145 | return CTC_STATUS_SUCCESS; 146 
| } 147 | 148 | ctcStatus_t reduce_negate(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 149 | return reduce(ctc_helper::negate(), ctc_helper::add(), input, output, rows, cols, axis, stream); 150 | } 151 | 152 | ctcStatus_t reduce_exp(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 153 | return reduce(ctc_helper::exponential(), ctc_helper::add(), input, output, rows, cols, axis, stream); 154 | } 155 | 156 | ctcStatus_t reduce_max(const float *input, float *output, int rows, int cols, bool axis, cudaStream_t stream) { 157 | return reduce(ctc_helper::identity(), ctc_helper::maximum(),input, output, rows, cols, axis, stream); 158 | } 159 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/tests/test.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | inline void throw_on_error(ctcStatus_t status, const char* message) { 10 | if (status != CTC_STATUS_SUCCESS) { 11 | throw std::runtime_error(message + (", stat = " + 12 | std::string(ctcGetStatusString(status)))); 13 | } 14 | } 15 | 16 | #ifdef __CUDACC__ 17 | #include 18 | #include 19 | 20 | inline void throw_on_error(cudaError_t error, const char* message) { 21 | if (error) { 22 | throw thrust::system_error(error, thrust::cuda_category(), message); 23 | } 24 | } 25 | 26 | #endif 27 | 28 | std::vector 29 | genActs(int size) { 30 | std::vector arr(size); 31 | std::mt19937 gen(0); 32 | std::uniform_real_distribution<> dis(0, 1); 33 | for(int i = 0; i < size; ++i) 34 | arr[i] = dis(gen); 35 | return arr; 36 | } 37 | 38 | std::vector 39 | genLabels(int alphabet_size, int L) { 40 | std::vector label(L); 41 | 42 | std::mt19937 gen(1); 43 | std::uniform_int_distribution<> dis(1, alphabet_size - 1); 44 | 45 | for(int i = 0; i < L; ++i) { 46 | label[i] = dis(gen); 47 | } 48 | // 
guarantee repeats for testing 49 | if (L >= 3) { 50 | label[L / 2] = label[L / 2 + 1]; 51 | label[L / 2 - 1] = label[L / 2]; 52 | } 53 | return label; 54 | } 55 | 56 | float rel_diff(const std::vector& grad, 57 | const std::vector& num_grad) { 58 | float diff = 0.; 59 | float tot = 0.; 60 | for(size_t idx = 0; idx < grad.size(); ++idx) { 61 | diff += (grad[idx] - num_grad[idx]) * (grad[idx] - num_grad[idx]); 62 | tot += grad[idx] * grad[idx]; 63 | } 64 | 65 | return diff / tot; 66 | } 67 | 68 | // Numerically stable softmax for a minibatch of 1 69 | void softmax(const float* const acts, 70 | int alphabet_size, int T, 71 | float *probs) { 72 | 73 | for (int t = 0; t < T; ++t) { 74 | 75 | float max_activation = 76 | -std::numeric_limits::infinity(); 77 | 78 | for (int a = 0; a < alphabet_size; ++a) 79 | max_activation = 80 | std::max(max_activation, acts[t*alphabet_size + a]); 81 | 82 | float denom = 0; 83 | for (int a = 0; a < alphabet_size; ++a) 84 | denom += std::exp(acts[t*alphabet_size + a] - max_activation); 85 | 86 | for (int a = 0; a < alphabet_size; ++a) 87 | probs[t*alphabet_size + a] = 88 | std::exp(acts[t*alphabet_size + a] - max_activation) / denom; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/transforms/warp-ctc/tests/test_cpu.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | #include "test.h" 11 | 12 | bool small_test() { 13 | const int alphabet_size = 5; 14 | const int T = 2; 15 | 16 | std::vector activations = {0.1, 0.6, 0.1, 0.1, 0.1, 17 | 0.1, 0.1, 0.6, 0.1, 0.1}; 18 | 19 | // Calculate the score analytically 20 | float expected_score; 21 | { 22 | std::vector probs(activations.size()); 23 | softmax(activations.data(), alphabet_size, T, probs.data()); 24 | 25 | // Score calculation is specific to the given activations above 26 | expected_score = probs[1] * 
probs[7]; 27 | } 28 | 29 | std::vector labels = {1, 2}; 30 | std::vector label_lengths = {2}; 31 | 32 | std::vector lengths; 33 | lengths.push_back(T); 34 | 35 | float score; 36 | 37 | ctcComputeInfo info; 38 | info.loc = CTC_CPU; 39 | info.num_threads = 1; 40 | 41 | size_t cpu_alloc_bytes; 42 | throw_on_error(get_workspace_size(label_lengths.data(), lengths.data(), 43 | alphabet_size, lengths.size(), info, 44 | &cpu_alloc_bytes), 45 | "Error: get_workspace_size in small_test"); 46 | 47 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 48 | 49 | throw_on_error(compute_ctc_loss(activations.data(), NULL, 50 | labels.data(), label_lengths.data(), 51 | lengths.data(), 52 | alphabet_size, 53 | lengths.size(), 54 | &score, 55 | ctc_cpu_workspace, 56 | info), 57 | "Error: compute_ctc_loss in small_test"); 58 | 59 | free(ctc_cpu_workspace); 60 | score = std::exp(-score); 61 | const float eps = 1e-6; 62 | 63 | const float lb = expected_score - eps; 64 | const float ub = expected_score + eps; 65 | 66 | return (score > lb && score < ub); 67 | } 68 | 69 | bool inf_test() { 70 | const int alphabet_size = 15; 71 | const int T = 50; 72 | const int L = 10; 73 | const int minibatch = 1; 74 | 75 | std::vector labels = genLabels(alphabet_size, L); 76 | labels[0] = 2; 77 | std::vector label_lengths = {L}; 78 | 79 | std::vector acts = genActs(alphabet_size * T * minibatch); 80 | 81 | for (int i = 0; i < T; ++i) 82 | acts[alphabet_size * i + 2] = -1e30; 83 | 84 | std::vector sizes; 85 | sizes.push_back(T); 86 | 87 | std::vector grads(alphabet_size * T); 88 | 89 | float cost; 90 | 91 | ctcComputeInfo info; 92 | info.loc = CTC_CPU; 93 | info.num_threads = 1; 94 | 95 | size_t cpu_alloc_bytes; 96 | throw_on_error(get_workspace_size(label_lengths.data(), sizes.data(), 97 | alphabet_size, sizes.size(), info, 98 | &cpu_alloc_bytes), 99 | "Error: get_workspace_size in inf_test"); 100 | 101 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 102 | 103 | 
throw_on_error(compute_ctc_loss(acts.data(), grads.data(), 104 | labels.data(), label_lengths.data(), 105 | sizes.data(), 106 | alphabet_size, 107 | sizes.size(), 108 | &cost, 109 | ctc_cpu_workspace, 110 | info), 111 | "Error: compute_ctc_loss in inf_test"); 112 | 113 | free(ctc_cpu_workspace); 114 | 115 | bool status = true; 116 | status &= std::isinf(cost); 117 | 118 | for (int i = 0; i < alphabet_size * T; ++i) 119 | status &= !std::isnan(grads[i]); 120 | 121 | return status; 122 | } 123 | 124 | float grad_check(int T, int alphabet_size, 125 | std::vector& acts, 126 | const std::vector>& labels, 127 | const std::vector& sizes) { 128 | 129 | float epsilon = 1e-2; 130 | 131 | const int minibatch = labels.size(); 132 | 133 | std::vector flat_labels; 134 | std::vector label_lengths; 135 | for (const auto& l : labels) { 136 | flat_labels.insert(flat_labels.end(), l.begin(), l.end()); 137 | label_lengths.push_back(l.size()); 138 | } 139 | 140 | std::vector costs(minibatch); 141 | 142 | std::vector grads(acts.size()); 143 | 144 | ctcComputeInfo info; 145 | info.loc = CTC_CPU; 146 | info.num_threads = 1; 147 | 148 | size_t cpu_alloc_bytes; 149 | throw_on_error(get_workspace_size(label_lengths.data(), sizes.data(), 150 | alphabet_size, sizes.size(), info, 151 | &cpu_alloc_bytes), 152 | "Error: get_workspace_size in grad_check"); 153 | 154 | void* ctc_cpu_workspace = malloc(cpu_alloc_bytes); 155 | 156 | throw_on_error(compute_ctc_loss(acts.data(), grads.data(), 157 | flat_labels.data(), label_lengths.data(), 158 | sizes.data(), 159 | alphabet_size, 160 | minibatch, 161 | costs.data(), 162 | ctc_cpu_workspace, 163 | info), 164 | "Error: compute_ctc_loss (0) in grad_check"); 165 | 166 | float cost = std::accumulate(costs.begin(), costs.end(), 0.); 167 | 168 | std::vector num_grad(grads.size()); 169 | 170 | //perform 2nd order central differencing 171 | for (int i = 0; i < T * alphabet_size * minibatch; ++i) { 172 | 173 | std::vector costsP1(minibatch); 174 | std::vector 
costsP2(minibatch); 175 | 176 | acts[i] += epsilon; 177 | throw_on_error(compute_ctc_loss(acts.data(), NULL, 178 | flat_labels.data(), label_lengths.data(), 179 | sizes.data(), 180 | alphabet_size, 181 | minibatch, 182 | costsP1.data(), 183 | ctc_cpu_workspace, 184 | info), 185 | "Error: compute_ctc_loss (1) in grad_check"); 186 | 187 | acts[i] -= 2 * epsilon; 188 | throw_on_error(compute_ctc_loss(acts.data(), NULL, 189 | flat_labels.data(), label_lengths.data(), 190 | sizes.data(), 191 | alphabet_size, 192 | minibatch, 193 | costsP2.data(), 194 | ctc_cpu_workspace, 195 | info), 196 | "Error: compute_ctc_loss (2) in grad_check"); 197 | 198 | float costP1 = std::accumulate(costsP1.begin(), costsP1.end(), 0.); 199 | float costP2 = std::accumulate(costsP2.begin(), costsP2.end(), 0.); 200 | 201 | acts[i] += epsilon; 202 | num_grad[i] = (costP1 - costP2) / (2 * epsilon); 203 | } 204 | 205 | free(ctc_cpu_workspace); 206 | 207 | float diff = rel_diff(grads, num_grad); 208 | 209 | return diff; 210 | } 211 | 212 | bool run_tests() { 213 | std::vector> problem_sizes = 214 | {std::make_tuple(20, 50, 15, 1, 1e-5), 215 | std::make_tuple(5, 10, 5, 65, 1e-4) 216 | }; 217 | 218 | std::mt19937 gen(2); 219 | 220 | bool status = true; 221 | for (auto problem : problem_sizes) { 222 | int alphabet_size, T, L, minibatch; 223 | float tol; 224 | std::tie(alphabet_size, T, L, minibatch, tol) = problem; 225 | 226 | std::vector acts = genActs(alphabet_size * T * minibatch); 227 | 228 | std::vector> labels; 229 | std::vector sizes; 230 | for (int mb = 0; mb < minibatch; ++mb) { 231 | int actual_length = L; 232 | labels.push_back(genLabels(alphabet_size, actual_length)); 233 | sizes.push_back(T); 234 | } 235 | 236 | float diff = grad_check(T, alphabet_size, acts, labels, sizes); 237 | 238 | status &= (diff < tol); 239 | } 240 | 241 | return status; 242 | } 243 | 244 | int main(void) { 245 | std::cout << "Running CPU tests" << std::endl; 246 | 247 | bool status = true; 248 | status &= 
small_test(); 249 | status &= inf_test(); 250 | status &= run_tests(); 251 | 252 | if (status) 253 | std::cout << "Tests pass" << std::endl; 254 | else 255 | std::cout << "Some or all tests fail" << std::endl; 256 | } 257 | --------------------------------------------------------------------------------