├── .gitignore
├── .gitmodules
├── CITATION.cff
├── LICENSE
├── NOTICE
├── README.md
├── analyzer
    ├── .gitignore
    ├── README.md
    ├── extract-models.sh
    ├── habitat
    │   ├── __init__.py
    │   ├── analysis
    │   │   ├── __init__.py
    │   │   ├── arguments.py
    │   │   ├── device.py
    │   │   ├── kernels.py
    │   │   ├── metrics.py
    │   │   ├── mlp
    │   │   │   ├── .gitignore
    │   │   │   ├── __init__.py
    │   │   │   ├── dataset.py
    │   │   │   ├── dataset_process.py
    │   │   │   ├── devices.csv
    │   │   │   ├── devices.py
    │   │   │   ├── mlp.py
    │   │   │   ├── saved_models
    │   │   │   │   └── .gitignore
    │   │   │   └── train.py
    │   │   ├── operation.py
    │   │   ├── predictor.py
    │   │   ├── run_time.py
    │   │   ├── trace.py
    │   │   └── wave_scaling
    │   │   │   ├── __init__.py
    │   │   │   ├── common.py
    │   │   │   ├── metadata.py
    │   │   │   ├── resimplified.py
    │   │   │   ├── roofline.py
    │   │   │   └── unified.py
    │   ├── data
    │   │   ├── __init__.py
    │   │   ├── checksums
    │   │   ├── devices.yml
    │   │   └── verify.sh
    │   ├── profiling
    │   │   ├── __init__.py
    │   │   ├── autograd.py
    │   │   ├── backward.py
    │   │   ├── kernel.py
    │   │   ├── operation.py
    │   │   └── run_time.py
    │   ├── tracking
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── callable.py
    │   │   ├── hook_manager.py
    │   │   └── operation.py
    │   └── utils.py
    ├── install-dev.sh
    ├── pyproject.toml
    └── setup.py
├── cpp
    ├── .gitignore
    ├── CMakeLists.txt
    ├── README.md
    ├── cmake
    │   ├── FindCUPTI.cmake
    │   └── FindNVPerf.cmake
    ├── external
    │   ├── CMakeLists.txt
    │   └── cupti_profilerhost_util
    │   │   ├── CMakeLists.txt
    │   │   ├── include
    │   │       ├── c_util
    │   │       │   ├── FileOp.h
    │   │       │   └── ScopeExit.h
    │   │       └── profilerhost_util
    │   │       │   ├── Eval.h
    │   │       │   ├── List.h
    │   │       │   ├── Metric.h
    │   │       │   └── Parser.h
    │   │   └── src
    │   │       └── profilerhost_util
    │   │           ├── Eval.cpp
    │   │           ├── List.cpp
    │   │           └── Metric.cpp
    └── src
    │   ├── CMakeLists.txt
    │   ├── cuda
    │       ├── CMakeLists.txt
    │       ├── cuda_macros.h
    │       ├── cuda_occupancy.h
    │       ├── cupti_exceptions.cpp
    │       ├── cupti_exceptions.h
    │       ├── cupti_macros.h
    │       ├── cupti_manager.cpp
    │       ├── cupti_profiler.cpp
    │       ├── cupti_profiler.h
    │       ├── cupti_tracer.cpp
    │       ├── diagnostics.cu
    │       ├── diagnostics.h
    │       ├── habitat_cupti.h
    │       ├── kernel.cpp
    │       ├── kernel.h
    │       ├── legacy_cupti_profiler.cpp
    │       ├── legacy_cupti_profiler.h
    │       ├── metrics.h
    │       ├── new_cupti_profiler.cpp
    │       ├── new_cupti_profiler.h
    │       ├── sampled_measurement.h
    │       ├── utils-inl.h
    │       └── utils.h
    │   ├── device_info.cpp
    │   ├── frontend
    │       ├── CMakeLists.txt
    │       ├── model_bindings.cpp
    │       ├── model_bindings.h
    │       ├── profiler.cpp
    │       └── profiler.h
    │   └── habitat_cuda.cpp
├── docker
    ├── Dockerfile
    ├── README.md
    ├── create-user.sh
    ├── setup.sh
    ├── start.sh
    └── vars.sh
├── experiments
    ├── .gitignore
    ├── dcgan
    │   ├── LICENSE
    │   ├── README.md
    │   ├── dcgan.py
    │   └── entry_point.py
    ├── gather_raw_data.sh
    ├── gnmt
    │   ├── README.md
    │   ├── __init__.py
    │   ├── entry_point.py
    │   └── seq2seq
    │   │   ├── LICENSE
    │   │   ├── data
    │   │       ├── config.py
    │   │       ├── dataset.py
    │   │       ├── sampler.py
    │   │       └── tokenizer.py
    │   │   ├── inference
    │   │       ├── beam_search.py
    │   │       └── inference.py
    │   │   ├── models
    │   │       ├── attention.py
    │   │       ├── decoder.py
    │   │       ├── encoder.py
    │   │       ├── gnmt.py
    │   │       └── seq2seq_base.py
    │   │   ├── train
    │   │       ├── fp_optimizers.py
    │   │       ├── lr_scheduler.py
    │   │       ├── smoothing.py
    │   │       └── trainer.py
    │   │   └── utils.py
    ├── inception
    │   ├── LICENSE
    │   ├── README.md
    │   ├── entry_point.py
    │   └── inception.py
    ├── process_raw_data.sh
    ├── process_results.py
    ├── resnet
    │   ├── LICENSE
    │   ├── README.md
    │   ├── __init__.py
    │   ├── entry_point.py
    │   └── resnet.py
    ├── run_experiment.py
    └── transformer
    │   ├── LICENSE
    │   ├── README.md
    │   ├── __init__.py
    │   ├── entry_point.py
    │   └── tfmr
    │       ├── Beam.py
    │       ├── Constants.py
    │       ├── Layers.py
    │       ├── Models.py
    │       ├── Modules.py
    │       ├── Optim.py
    │       ├── SubLayers.py
    │       ├── Translator.py
    │       └── __init__.py
└── tools
    ├── device-metadata
        ├── README.md
        └── measure_peak_flops.py
    ├── kernel-metadata
        ├── extract.sh
        └── process-cuobjdump-output.py
    └── recording
        ├── combine_data.py
        ├── database.py
        ├── features.py
        ├── record_bmm.py
        ├── record_common.py
        ├── record_conv2d.py
        ├── record_linear.py
        ├── record_lstm.py
        └── to_dataset.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | *.swp
 3 | 
 4 | # PyTorch serialized modules
 5 | *.pt
 6 | 
 7 | analyzer/habitat/data/LICENSE
 8 | analyzer/habitat/data/NOTICE
 9 | analyzer/habitat/data/README.md
10 | analyzer/habitat/data/kernels.sqlite
11 | analyzer/habitat/data/**/*.pth
12 | 
13 | # Python
14 | *.pyc
15 | __pycache__
16 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "cpp/external/pybind11"]
2 | 	path = cpp/external/pybind11
3 | 	url = https://github.com/pybind/pybind11
4 | [submodule "cpp/external/gflags"]
5 | 	path = cpp/external/gflags
6 | 	url = https://github.com/gflags/gflags
7 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use Habitat, please cite it as below."
 3 | authors:
 4 | - family-names: "Yu"
 5 |   given-names: "Geoffrey X."
 6 | - family-names: "Gao"
 7 |   given-names: "Yubo"
 8 | - family-names: "Golikov"
 9 |   given-names: "Pavel"
10 | - family-names: "Pekhimenko"
11 |   given-names: "Gennady"
12 | title: "Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training"
13 | version: 1.0.0
14 | doi: 10.5281/zenodo.4885489
15 | date-released: 2021-06-01
16 | url: "https://github.com/geoffxy/habitat"
17 | preferred-citation:
18 |   type: conference-paper
19 |   authors:
20 |   - family-names: "Yu"
21 |     given-names: "Geoffrey X."
22 |   - family-names: "Gao"
23 |     given-names: "Yubo"
24 |   - family-names: "Golikov"
25 |     given-names: "Pavel"
26 |   - family-names: "Pekhimenko"
27 |     given-names: "Gennady"
28 |   collection-title: "Proceedings of the 2021 USENIX Annual Technical Conference (USENIX ATC '21)"
29 |   start: 503
30 |   end: 521
31 |   title: "Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training"
32 |   month: 7
33 |   year: 2021
34 | 


--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
 1 | Copyright 2021 Geoffrey Yu
 2 | Copyright 2021 Yubo Gao
 3 | Copyright 2021 Pavel Golikov
 4 | Copyright 2021 Gennady Pekhimenko
 5 | 
 6 | Licensed under the Apache License, Version 2.0 (the "License");
 7 | you may not use this project except in compliance with the License.
 8 | You may obtain a copy of the License at
 9 | 
10 |    http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training
 2 | 
 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4885489.svg)](https://doi.org/10.5281/zenodo.4885489)
 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4876277.svg)](https://doi.org/10.5281/zenodo.4876277)
 5 | 
 6 | Habitat is a tool that predicts a deep neural network's training iteration
 7 | execution time on a given GPU. It currently supports PyTorch. To learn more
 8 | about how Habitat works, please see our [research
 9 | paper](https://arxiv.org/abs/2102.00527).
10 | 
11 | 
12 | ## Running From Source
13 | 
14 | Currently, the only way to run Habitat is to build it from source. You should
15 | use the Docker image provided in this repository to make sure that you can
16 | compile the code.
17 | 
18 | 1. Download the [Habitat pre-trained
19 |    models](https://doi.org/10.5281/zenodo.4876277).
20 | 2. Run `extract-models.sh` under `analyzer` to extract and install the
21 |    pre-trained models.
22 | 3. Run `setup.sh` under `docker/` to build the Habitat container image.
23 | 4. Run `start.sh` to start a new container. By default, your home directory
24 |    will be mounted inside the container under `~/home`.
25 | 5. Once inside the container, run `install-dev.sh` under `analyzer/` to build
26 |    and install the Habitat package.
27 | 6. In your scripts, `import habitat` to get access to Habitat. See
28 |    `experiments/run_experiment.py` for an example showing how to use Habitat.
29 | 
30 | **Note:** Habitat needs access to your GPU's performance counters, which
31 | requires special permissions if you are running with a recent driver (418.43 or
32 | later). If you encounter a `CUPTI_ERROR_INSUFFICIENT_PRIVILEGES` error when
33 | running Habitat, please follow the instructions
34 | [here](https://developer.nvidia.com/ERR_NVGPUCTRPERM)
35 | and in [issue #5](https://github.com/geoffxy/habitat/issues/5).
36 | 
37 | 
38 | ## License
39 | 
40 | The code in this repository is licensed under the Apache 2.0 license (see
41 | `LICENSE` and `NOTICE`), with the exception of the files mentioned below.
42 | 
43 | This software contains source code provided by NVIDIA Corporation. These files
44 | are:
45 | 
46 | - The code under `cpp/external/cupti_profilerhost_util/` (CUPTI sample code)
47 | - `cpp/src/cuda/cuda_occupancy.h`
48 | 
49 | The code mentioned above is licensed under the [NVIDIA Software Development
50 | Kit End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html).
51 | 
52 | We include the implementations of several deep neural networks under
53 | `experiments/` for our evaluation. These implementations are copyrighted by
54 | their original authors and carry their original licenses. Please see the
55 | corresponding `README` files and license files inside the subdirectories for
56 | more information.
57 | 
58 | 
59 | ## Research Paper
60 | 
61 | Habitat began as a research project in the [EcoSystem
62 | Group](https://www.cs.toronto.edu/ecosystem) at the [University of
63 | Toronto](https://cs.toronto.edu). The accompanying research paper will appear
64 | in the proceedings of [USENIX
65 | ATC'21](https://www.usenix.org/conference/atc21/presentation/yu). If you are
66 | interested, you can read a preprint of the paper
67 | [here](https://arxiv.org/abs/2102.00527).
68 | 
69 | If you use Habitat in your research, please consider citing our paper:
70 | 
71 | ```bibtex
72 | @inproceedings{habitat-yu21,
73 |   author = {Yu, Geoffrey X. and Gao, Yubo and Golikov, Pavel and Pekhimenko,
74 |     Gennady},
75 |   title = {{Habitat: A Runtime-Based Computational Performance Predictor for
76 |     Deep Neural Network Training}},
77 |   booktitle = {{Proceedings of the 2021 USENIX Annual Technical Conference
78 |     (USENIX ATC'21)}},
79 |   year = {2021},
80 | }
81 | ```
82 | 


--------------------------------------------------------------------------------
/analyzer/.gitignore:
--------------------------------------------------------------------------------
1 | habitat_predict.egg-info
2 | habitat/habitat_cuda.cpython-36m-x86_64-linux-gnu.so
3 | 


--------------------------------------------------------------------------------
/analyzer/README.md:
--------------------------------------------------------------------------------
1 | Habitat
2 | =======
3 | This directory contains the Python source code for Habitat—a tool that predicts
4 | the execution time of DNN operations across different GPUs.
5 | 


--------------------------------------------------------------------------------
/analyzer/extract-models.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | if [ -z $1 ]; then
 6 |   >&2 echo "Usage: $0 path/to/habitat-models.tar.gz"
 7 |   >&2 echo ""
 8 |   >&2 echo "This script extracts and installs Habitat's pre-trained models."
 9 |   exit 1
10 | fi
11 | 
12 | archive_loc=$(pwd)/$1
13 | 
14 | script_loc=$(cd $(dirname $0) && pwd -P)
15 | cd $script_loc
16 | 
17 | tar xzf $archive_loc -C habitat/data/
18 | cd habitat/data/
19 | 
20 | ./verify.sh
21 | 


--------------------------------------------------------------------------------
/analyzer/habitat/__init__.py:
--------------------------------------------------------------------------------
 1 | from habitat.analysis import Device
 2 | from habitat.analysis.metrics import Metric
 3 | from habitat.analysis.predictor import Predictor
 4 | from habitat.tracking.operation import OperationTracker
 5 | 
 6 | __version__ = '1.0.0'
 7 | __description__ = 'Cross-GPU performance predictions for PyTorch neural network training.'
 8 | 
 9 | __author__ = 'Geoffrey Yu'
10 | __email__ = 'gxyu@cs.toronto.edu'
11 | 
12 | __license__ = 'Apache-2.0'
13 | 
14 | __all__ = [
15 |     'Device',
16 |     'Metric',
17 |     'OperationTracker',
18 |     'Predictor',
19 | ]
20 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/__init__.py:
--------------------------------------------------------------------------------
 1 | from habitat.analysis.device import _Device
 2 | 
 3 | Device = _Device()
 4 | 
 5 | SPECIAL_OPERATIONS = {
 6 |     # Convolution
 7 |     'conv2d',
 8 | 
 9 |     # Matrix multiply operations
10 |     'linear',
11 |     'bmm',
12 | 
13 |     # Recurrent operations
14 |     'lstm',
15 |     'gru',
16 |     'rnn_tanh',
17 |     'rnn_relu',
18 | }
19 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/arguments.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | class Arguments:
 5 |     """
 6 |     Stores representations of an operation's arguments.
 7 |     """
 8 |     def __init__(self, args, kwargs):
 9 |         self.args = args
10 |         self.kwargs = kwargs
11 |         self.special = {}
12 | 
13 |     @classmethod
14 |     def from_raw_arguments(cls, args, kwargs):
15 |         processed_args = tuple(map(_process_argument, args))
16 |         processed_kwargs = {
17 |             arg_name: _process_argument(arg_value)
18 |             for arg_name, arg_value in kwargs.items()
19 |         }
20 |         return cls(processed_args, processed_kwargs)
21 | 
22 | 
23 | def _process_argument(argument):
24 |     if isinstance(argument, tuple):
25 |         return tuple(map(_process_argument, argument))
26 | 
27 |     if isinstance(argument, list):
28 |         return list(map(_process_argument, argument))
29 | 
30 |     # At this point we expect the argument to either be a
31 |     # torch.Tensor or to be a scalar (e.g., an integer).
32 |     if isinstance(argument, torch.Tensor):
33 |         # We only store the tensor dimensions
34 |         return argument.size()
35 |     else:
36 |         return argument
37 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/device.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | # This singleton class simulates an enum that consists of the GPU devices we
 4 | # support. Users can access a device using its identifier (e.g., Device.V100).
 5 | class _Device:
 6 |     def __init__(self):
 7 |         self._devices = None
 8 | 
 9 |     def __getattr__(self, device_name):
10 |         if self._devices is None:
11 |             # Lazily load the devices on the first access
12 |             self._load_devices()
13 |         return self._devices[device_name]
14 | 
15 |     def _load_devices(self):
16 |         import yaml
17 |         import habitat.habitat_cuda as hc
18 |         import habitat.data as hd
19 |         with open(hd.path_to_data('devices.yml')) as devices_yaml:
20 |             devices = yaml.load(devices_yaml, Loader=yaml.Loader)
21 |         self._devices = {
22 |             device_name: hc.DeviceProperties(name=device_name, **properties)
23 |             for device_name, properties in devices.items()
24 |         }
25 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/kernels.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class MeasuredKernel:
 4 |     def __init__(self, time_kernel, metrics_kernels, device):
 5 |         self._c = time_kernel
 6 |         self._metrics_kernels = metrics_kernels
 7 |         self._device = device
 8 |         self._cached_metrics = {}
 9 | 
10 |     def get_metric(self, metric_info, default=None):
11 |         if metric_info in self._cached_metrics:
12 |             return self._cached_metrics[metric_info]
13 | 
14 |         for metric_kernel in self._metrics_kernels:
15 |             for raw_metric_name, raw_metric_value in metric_kernel.metrics:
16 |                 if (raw_metric_name == metric_info.value.cupti_name or
17 |                         raw_metric_name == metric_info.value.legacy_cupti_name):
18 |                     canonical_value = metric_info.value.to_canonical_value(
19 |                         raw_metric_value, self._device)
20 |                     self._cached_metrics[metric_info] = canonical_value
21 |                     return canonical_value
22 | 
23 |         if default is None:
24 |             raise AttributeError('Unknown metric: {}'.format(metric_info.name))
25 | 
26 |         return default
27 | 
28 |     def __getattr__(self, name):
29 |         # Delegate to the underlying C++ object for non-overridden attributes
30 |         return getattr(self._c, name)
31 | 
32 | 
33 | class PredictedKernel:
34 |     def __init__(self, measured_kernel, run_time_ns):
35 |         self._measured_kernel = measured_kernel
36 |         self._run_time_ns = run_time_ns
37 | 
38 |     @property
39 |     def run_time_ns(self):
40 |         return self._run_time_ns
41 | 
42 |     @property
43 |     def name(self):
44 |         return self._measured_kernel.name
45 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/metrics.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
 4 | class _MetricInfo:
 5 |     def __init__(
 6 |         self,
 7 |         cupti_name,
 8 |         legacy_cupti_name,
 9 |         legacy_to_canonical_fn
10 |     ):
11 |         self._cupti_name = cupti_name
12 |         self._legacy_cupti_name = legacy_cupti_name
13 |         self._legacy_to_canonical_fn = legacy_to_canonical_fn
14 | 
15 |     @property
16 |     def cupti_name(self):
17 |         return self._cupti_name
18 | 
19 |     @property
20 |     def legacy_cupti_name(self):
21 |         return self._legacy_cupti_name
22 | 
23 |     def to_canonical_value(self, value, device):
24 |         if device.compute_capability[0] >= 7:
25 |             return value
26 |         return self._legacy_to_canonical_fn(value)
27 | 
28 | 
29 | class Metric(Enum):
30 |     DRAMUtilization = _MetricInfo(
31 |         'dram__throughput.avg.pct_of_peak_sustained_elapsed',
32 |         'dram_utilization',
33 |         lambda value: value * 10,
34 |     )
35 |     DRAMReadBytes = _MetricInfo(
36 |         'dram__bytes_read.sum',
37 |         'dram_read_bytes',
38 |         lambda value: value,
39 |     )
40 |     DRAMWriteBytes = _MetricInfo(
41 |         'dram__bytes_write.sum',
42 |         'dram_write_bytes',
43 |         lambda value: value,
44 |     )
45 |     SinglePrecisionFLOPEfficiency = _MetricInfo(
46 |         'smsp__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.avg.pct_of_peak_sustained_elapsed',
47 |         'flop_sp_efficiency',
48 |         lambda value: value,
49 |     )
50 |     SinglePrecisionAddOps = _MetricInfo(
51 |         'smsp__sass_thread_inst_executed_op_fadd_pred_on.sum',
52 |         'flop_count_sp_add',
53 |         lambda value: value,
54 |     )
55 | 
56 | 
57 | def resolve_metrics(metrics, device):
58 |     """
59 |     Converts Metric enum values into raw metric strings that can be passed to
60 |     CUPTI, depending on the compute capability of the given device.
61 | 
62 |     This is needed because the metrics names changed after (and including)
63 |     compute capability 7.0 (Volta).
64 | 
65 |     If the metrics passed in are already resolved, this function will return a
66 |     copy of them.
67 |     """
68 |     if metrics is None:
69 |         return []
70 | 
71 |     if isinstance(metrics, list) or isinstance(metrics, tuple):
72 |         return [
73 |             _get_metric_name(metric, device)
74 |             for metric in metrics
75 |         ]
76 |     else:
77 |         return [_get_metric_name(metrics, device)]
78 | 
79 | 
80 | def _get_metric_name(metric, device):
81 |     if isinstance(metric, Metric):
82 |         return (
83 |             metric.value.cupti_name
84 |             if device.compute_capability[0] >= 7
85 |             else metric.value.legacy_cupti_name
86 |         )
87 |     else:
88 |         return metric
89 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/.gitignore:
--------------------------------------------------------------------------------
1 | logs
2 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/analysis/mlp/__init__.py


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/dataset.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import torch
 3 | 
 4 | from torch.utils.data import Dataset
 5 | 
 6 | from habitat.analysis.mlp.dataset_process import get_dataset
 7 | 
 8 | 
 9 | class HabitatDataset(Dataset):
10 |     def __init__(self, dataset_path, features):
11 |         self.x, self.y = get_dataset(dataset_path, features)
12 | 
13 |         # input normalization
14 |         self.x = np.array(self.x)
15 | 
16 |         self.mu = np.mean(self.x, axis=0)
17 |         self.sigma = np.std(self.x, axis=0)
18 | 
19 |         self.x = np.divide(np.subtract(self.x, self.mu), self.sigma)
20 | 
21 |     def __len__(self):
22 |         return len(self.y)
23 | 
24 |     def __getitem__(self, idx):
25 |         if torch.is_tensor(idx):
26 |             idx = idx.tolist()
27 | 
28 |         return torch.from_numpy(np.array(self.x[idx]).astype(np.float32), ), float(self.y[idx])
29 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/dataset_process.py:
--------------------------------------------------------------------------------
 1 | import sqlite3
 2 | import pandas as pd
 3 | import glob
 4 | import functools
 5 | from tqdm import tqdm
 6 | 
 7 | from habitat.analysis.mlp.devices import get_device_features, get_all_devices
 8 | 
 9 | 
10 | def get_dataset(path, features, device_features=None):
11 |     if device_features is None:
12 |         device_features = ['mem', 'mem_bw', 'num_sm', 'single']
13 | 
14 |     SELECT_QUERY = """
15 |       SELECT {features}, SUM(run_time_ms) AS run_time_ms
16 |       FROM recordings
17 |       GROUP BY {features}
18 |     """
19 | 
20 |     # read datasets
21 |     files = glob.glob(path + "/*.sqlite")
22 | 
23 |     # read individual sqlite files and categorize by device
24 |     devices = dict()
25 |     for f in files:
26 |         device_name = f.split("/")[-1].split("-")[1]
27 | 
28 |         conn = sqlite3.connect(f)
29 |         query = SELECT_QUERY.format(features=",".join(features))
30 | 
31 |         df = pd.read_sql_query(query, conn)
32 |         df = df.rename(columns={"run_time_ms": device_name})
33 | 
34 |         print("Loaded file %s (%d entries)" % (f, len(df.index)))
35 | 
36 |         if device_name not in devices:
37 |             devices[device_name] = df
38 |         else:
39 |             devices[device_name] = devices[device_name].append(df)
40 | 
41 |     for device in devices.keys():
42 |         print("Device %s contains %d entries" % (device, len(devices[device].index)))
43 | 
44 |     print()
45 | 
46 |     print("Merging")
47 |     df_merged = functools.reduce(
48 |         lambda df1, df2: pd.merge(df1, df2, on=features),
49 |         devices.values()
50 |     )
51 | 
52 |     print("Generating dataset")
53 |     # generate vectorized dataset (one entry for each device with device params)
54 |     device_params = get_all_devices(device_features)
55 | 
56 |     x, y = [], []
57 |     for device in devices.keys():
58 |         df_merged_device = df_merged[features + [device, ]]
59 |         for row in tqdm(df_merged_device.iterrows(), leave=False, desc=device, total=len(df_merged_device.index)):
60 |             row = row[1]
61 | 
62 |             x.append(list(row[:-1]) + device_params[device])
63 |             y.append(row[-1])
64 | 
65 |     return x, y
66 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/devices.csv:
--------------------------------------------------------------------------------
1 | device,mem,mem_type,mem_bw,num_sm,double,single,half
2 | P100,16,HBM2,732,56,4.7,9.3,18.7
3 | P4000,8,GDDR5,243,14,0.1656,5.3,
4 | RTX2070,8,GDDR6,448,36,0.20304,6.49728,12.99456
5 | RTX2080Ti,11,GDDR6,616,68,0.3672,11.7504,23.5008
6 | T4,16,GDDR6,320,40,,8.1,
7 | V100,16,HBM2,900,80,,14.028,
8 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/devices.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | 
 4 | 
 5 | def get_device_features(device_name, device_params):
 6 |     file_dir = os.path.abspath(os.path.dirname(__file__))
 7 |     df = pd.read_csv(os.path.join(file_dir, "devices.csv"))
 8 |     df = df[['device', ] + device_params]
 9 |     result = df[df['device'] == device_name].iloc[0]
10 |     return list(result)[1:]
11 | 
12 | def get_all_devices(device_params=None):
13 |     file_dir = os.path.abspath(os.path.dirname(__file__))
14 |     df = pd.read_csv(os.path.join(file_dir, "devices.csv"))
15 |     if type(device_params) is list:
16 |         df = df[['device',] + device_params]
17 | 
18 |     return {
19 |         row[1]: list(row[2:]) for row in df.itertuples()
20 |     }
21 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/saved_models/.gitignore:
--------------------------------------------------------------------------------
1 | bmm/*
2 | conv2d/*
3 | linear/*
4 | lstm/*
5 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/mlp/train.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import random
 3 | import torch
 4 | import numpy
 5 | 
 6 | from habitat.analysis.mlp.mlp import RuntimePredictor
 7 | 
 8 | 
 9 | def main():
10 |     parser = argparse.ArgumentParser(description="MLP Training Script")
11 |     parser.add_argument("operation", type=str)
12 |     parser.add_argument("dataset_path", type=str)
13 |     parser.add_argument("--layers", type=int, default=8)
14 |     parser.add_argument("--layer_size", type=int, default=1024)
15 |     parser.add_argument("--epochs", type=int, default=80)
16 |     parser.add_argument("--seed", type=int, default=1337)
17 | 
18 |     args = parser.parse_args()
19 | 
20 |     # Ensure reproducibility
21 |     random.seed(args.seed)
22 |     torch.manual_seed(args.seed)
23 |     numpy.random.seed(args.seed)
24 | 
25 |     predictor = RuntimePredictor(args.operation, args.layers, args.layer_size)
26 |     predictor.train_with_dataset(args.dataset_path, epochs=args.epochs)
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     main()
31 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/operation.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | class Operation:
  4 |     """
  5 |     Abstract representation of a logical operation in a model.
  6 |     """
  7 |     def __repr__(self):
  8 |         return self.name
  9 | 
 10 |     @property
 11 |     def run_time_ms(self):
 12 |         if self.backward is None:
 13 |             return self.forward.run_time_ms
 14 |         return self.forward.run_time_ms + self.backward.run_time_ms
 15 | 
 16 |     @property
 17 |     def ktime_ns(self):
 18 |         if self.backward is None:
 19 |             return self.forward.ktime_ns
 20 |         return self.forward.ktime_ns + self.backward.ktime_ns
 21 | 
 22 |     @property
 23 |     def arguments(self):
 24 |         return None
 25 | 
 26 |     @property
 27 |     def forward(self):
 28 |         raise NotImplementedError
 29 | 
 30 |     @property
 31 |     def backward(self):
 32 |         raise NotImplementedError
 33 | 
 34 |     @property
 35 |     def name(self):
 36 |         raise NotImplementedError
 37 | 
 38 |     @property
 39 |     def device(self):
 40 |         raise NotImplementedError
 41 | 
 42 |     def to_device(self, dest_device, predictor):
 43 |         raise NotImplementedError
 44 | 
 45 | 
 46 | class MeasuredOperation(Operation):
 47 |     def __init__(
 48 |         self,
 49 |         name,
 50 |         arguments,
 51 |         forward,
 52 |         backward,
 53 |         device,
 54 |     ):
 55 |         super().__init__()
 56 |         self._name = name
 57 |         self._arguments = arguments
 58 |         self._forward = forward
 59 |         self._backward = backward
 60 |         self._device = device
 61 | 
 62 |     @property
 63 |     def name(self):
 64 |         return self._name
 65 | 
 66 |     @property
 67 |     def arguments(self):
 68 |         return self._arguments
 69 | 
 70 |     @property
 71 |     def forward(self):
 72 |         return self._forward
 73 | 
 74 |     @property
 75 |     def backward(self):
 76 |         return self._backward
 77 | 
 78 |     @property
 79 |     def device(self):
 80 |         return self._device
 81 | 
 82 |     def to_device(self, dest_device, predictor):
 83 |         if dest_device.name == self._device.name:
 84 |             return self
 85 |         return predictor.predict_operation(self, dest_device)
 86 | 
 87 | 
 88 | class PredictedOperation(Operation):
 89 |     def __init__(
 90 |         self,
 91 |         measured_operation,
 92 |         forward,
 93 |         backward,
 94 |         device
 95 |     ):
 96 |         self._measured_operation = measured_operation
 97 |         self._forward = forward
 98 |         self._backward = backward
 99 |         self._device = device
100 | 
101 |     @property
102 |     def name(self):
103 |         return self._measured_operation.name
104 | 
105 |     @property
106 |     def arguments(self):
107 |         return self._measured_operation.arguments
108 | 
109 |     @property
110 |     def forward(self):
111 |         return self._forward
112 | 
113 |     @property
114 |     def backward(self):
115 |         return self._backward
116 | 
117 |     @property
118 |     def device(self):
119 |         return self._device
120 | 
121 |     def to_device(self, dest_device, predictor):
122 |         raise RuntimeError(
123 |             'Cannot make a prediction using a predicted operation.',
124 |         )
125 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/run_time.py:
--------------------------------------------------------------------------------
 1 | from habitat.analysis.kernels import MeasuredKernel
 2 | from habitat.utils import ns_to_ms
 3 | 
 4 | 
 5 | class RunTime:
 6 |     @property
 7 |     def run_time_ms(self):
 8 |         raise NotImplementedError
 9 | 
10 |     @property
11 |     def ktime_ns(self):
12 |         return sum(map(lambda k: k.run_time_ns, self.kernels))
13 | 
14 |     @property
15 |     def kernels(self):
16 |         return []
17 | 
18 |     @property
19 |     def device(self):
20 |         raise NotImplementedError
21 | 
22 | 
23 | class RunTimeMeasurement(RunTime):
24 |     def __init__(self, run_time_ms, kernels, device):
25 |         self._run_time_ms = run_time_ms
26 |         self._kernels = kernels
27 |         self._device = device
28 | 
29 |     @property
30 |     def run_time_ms(self):
31 |         return self._run_time_ms
32 | 
33 |     @property
34 |     def kernels(self):
35 |         return self._kernels
36 | 
37 |     @property
38 |     def device(self):
39 |         return self._device
40 | 
41 | 
42 | class RunTimePrediction(RunTime):
43 |     def __init__(self, overhead_ns, predicted_kernels, device):
44 |         self._run_time_ms = None
45 |         self._overhead_ns = overhead_ns
46 |         self._predicted_kernels = predicted_kernels
47 |         self._device = device
48 | 
49 |     @property
50 |     def run_time_ms(self):
51 |         if self._run_time_ms is not None:
52 |             return self._run_time_ms
53 |         run_time_ns = self._overhead_ns + sum(map(
54 |             lambda k: k.run_time_ns,
55 |             self.kernels,
56 |         ))
57 |         self._run_time_ms = ns_to_ms(run_time_ns)
58 |         return self._run_time_ms
59 | 
60 |     @property
61 |     def kernels(self):
62 |         return self._predicted_kernels
63 | 
64 |     @property
65 |     def device(self):
66 |         return self._device
67 | 
68 | 
69 | class RunTimePurePrediction(RunTime):
70 |     def __init__(self, run_time_ms, device):
71 |         self._run_time_ms = run_time_ms
72 |         self._device = device
73 | 
74 |     @property
75 |     def run_time_ms(self):
76 |         if self._run_time_ms is not None:
77 |             return self._run_time_ms
78 |         run_time_ns = self._overhead_ns + sum(map(
79 |             lambda k: k.run_time_ns,
80 |             self.kernels,
81 |         ))
82 |         self._run_time_ms = ns_to_ms(run_time_ns)
83 |         return self._run_time_ms
84 | 
85 |     @property
86 |     def device(self):
87 |         return self._device
88 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/trace.py:
--------------------------------------------------------------------------------
 1 | from itertools import chain
 2 | from habitat.analysis.predictor import Predictor
 3 | 
 4 | 
 5 | class Trace:
 6 |     """
 7 |     Represents an operation trace that was measured on a given device.
 8 |     """
 9 | 
10 |     # Used by default to make cross-device predictions.
11 |     DefaultPredictor = Predictor()
12 | 
13 |     def __init__(self, device, operations):
14 |         self._device = device
15 |         self._operations = operations
16 |         self._run_time_ms = None
17 | 
18 |     @property
19 |     def operations(self):
20 |         return self._operations
21 | 
22 |     @property
23 |     def device(self):
24 |         return self._device
25 | 
26 |     @property
27 |     def run_time_ms(self):
28 |         if self._run_time_ms is not None:
29 |             return self._run_time_ms
30 | 
31 |         self._run_time_ms = sum(map(
32 |             lambda op: op.run_time_ms,
33 |             self._operations,
34 |         ))
35 | 
36 |         return self._run_time_ms
37 | 
38 |     def to_device(self, dest_device, predictor=None):
39 |         """Get a predicted trace for the specified device."""
40 |         if dest_device.name == self.device.name:
41 |             return self
42 | 
43 |         actual_predictor = (
44 |             Trace.DefaultPredictor if predictor is None else predictor
45 |         )
46 | 
47 |         operations = [
48 |             operation.to_device(dest_device, actual_predictor)
49 |             for operation in self._operations
50 |         ]
51 |         return Trace(dest_device, operations)
52 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/analysis/wave_scaling/__init__.py


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/common.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | def calculate_wave_info(kernel, origin_device, dest_device, metadata_manager):
 4 |     origin_occupancy = kernel.thread_block_occupancy(origin_device)
 5 |     origin_wave_size = origin_device.num_sms * origin_occupancy
 6 | 
 7 |     dest_registers_per_thread = metadata_manager.kernel_registers_for(
 8 |         kernel,
 9 |         dest_device,
10 |     )
11 |     if dest_registers_per_thread is not None:
12 |         dest_occupancy = kernel.thread_block_occupancy(
13 |             dest_device,
14 |             dest_registers_per_thread,
15 |         )
16 |     else:
17 |         dest_occupancy = kernel.thread_block_occupancy(dest_device)
18 |     dest_wave_size = dest_device.num_sms * dest_occupancy
19 | 
20 |     return origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy
21 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/metadata.py:
--------------------------------------------------------------------------------
 1 | import sqlite3
 2 | import logging
 3 | 
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | 
 7 | class MetadataManager:
 8 |     def __init__(self, path_to_lut):
 9 |         self._connection = sqlite3.connect(path_to_lut)
10 | 
11 | 
12 |     def kernel_registers_for(self, kernel, device):
13 |         arch = int(''.join(map(lambda x: str(x), device.compute_capability)))
14 |         cursor = self._connection.cursor()
15 |         result = cursor.execute(
16 |             MetadataManager.kernel_registers_query,
17 |             (kernel.name, arch),
18 |         ).fetchone()
19 | 
20 |         if result is None:
21 |             logger.debug(
22 |                 'Missing kernel metadata entry for "%s" on arch %d.',
23 |                 kernel.name,
24 |                 arch,
25 |             )
26 |             return result
27 | 
28 |         actual_arch, registers_per_thread = result
29 |         if actual_arch != arch:
30 |             logger.debug(
31 |                 'Using substitute entry for "%s" at arch %d instead of %d.',
32 |                 kernel.name,
33 |                 actual_arch,
34 |                 arch,
35 |             )
36 | 
37 |         return registers_per_thread
38 | 
39 | 
40 | MetadataManager.kernel_registers_query = """
41 |   SELECT arch, registers_per_thread FROM kernels
42 |   WHERE name = ? AND arch <= ?
43 |   ORDER BY arch DESC LIMIT 1
44 | """
45 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/resimplified.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | 
 3 | from habitat.analysis.kernels import PredictedKernel
 4 | from habitat.analysis.wave_scaling.common import calculate_wave_info
 5 | 
 6 | 
 7 | def resimplified_wave_scaling(
 8 |     kernel,
 9 |     origin_device,
10 |     dest_device,
11 |     metadata_manager,
12 | ):
13 |     origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy = (
14 |         calculate_wave_info(
15 |             kernel,
16 |             origin_device,
17 |             dest_device,
18 |             metadata_manager,
19 |         )
20 |     )
21 | 
22 |     # Check if the kernel is too "small" - if it doesn't fill a single wave
23 |     # on the current device AND if it doesn't fill a single wave on the
24 |     # destination device
25 |     if (kernel.num_blocks // origin_wave_size == 0 and
26 |             kernel.num_blocks // dest_wave_size == 0):
27 |         # We scale the run time by the compute factor only
28 |         origin_max_occupancy = math.ceil(
29 |             kernel.num_blocks / origin_device.num_sms
30 |         )
31 |         dest_max_occupancy = math.ceil(
32 |             kernel.num_blocks / dest_device.num_sms
33 |         )
34 |         return PredictedKernel(kernel, kernel.run_time_ns)
35 | 
36 |     bandwidth_ratio = (
37 |         origin_device.mem_bandwidth_gb / dest_device.mem_bandwidth_gb
38 |     )
39 | 
40 |     return PredictedKernel(kernel, kernel.run_time_ns * bandwidth_ratio)
41 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/roofline.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | from habitat.analysis.metrics import Metric
  4 | from habitat.analysis.kernels import PredictedKernel
  5 | from habitat.analysis.wave_scaling.common import calculate_wave_info
  6 | 
  7 | 
  8 | def roofline_wave_scaling(
  9 |     kernel,
 10 |     origin_device,
 11 |     dest_device,
 12 |     metadata_manager,
 13 | ):
 14 |     gamma = _roofline_gamma(kernel, origin_device, dest_device)
 15 |     gamma_compl = 1.0 - gamma
 16 | 
 17 |     origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy = (
 18 |         calculate_wave_info(
 19 |             kernel,
 20 |             origin_device,
 21 |             dest_device,
 22 |             metadata_manager,
 23 |         )
 24 |     )
 25 | 
 26 |     # 1. Check if the kernel is too "small" - if it doesn't fill a single wave
 27 |     #    on the current device AND if it doesn't fill a single wave on the
 28 |     #    destination device
 29 |     if (kernel.num_blocks // origin_wave_size == 0 and
 30 |             kernel.num_blocks // dest_wave_size == 0):
 31 |         # We scale the run time by the compute factor only
 32 |         origin_max_occupancy = math.ceil(
 33 |             kernel.num_blocks / origin_device.num_sms
 34 |         )
 35 |         dest_max_occupancy = math.ceil(
 36 |             kernel.num_blocks / dest_device.num_sms
 37 |         )
 38 |         partial_compute_factor = (
 39 |             (origin_device.base_clock_mhz / dest_device.base_clock_mhz) *
 40 |             (dest_max_occupancy / origin_max_occupancy)
 41 |         )
 42 |         return PredictedKernel(
 43 |             kernel,
 44 |             kernel.run_time_ns * math.pow(partial_compute_factor, gamma_compl),
 45 |         )
 46 | 
 47 |     # 2. Compute the three scaling factors
 48 |     bandwidth_factor = (
 49 |         origin_device.mem_bandwidth_gb / dest_device.mem_bandwidth_gb
 50 |     )
 51 |     clock_factor = (
 52 |         origin_device.base_clock_mhz / dest_device.base_clock_mhz
 53 |     )
 54 |     sm_factor = (
 55 |         origin_device.num_sms / dest_device.num_sms
 56 |     )
 57 | 
 58 |     # 3. Scale and return the predicted run time
 59 |     scaled_run_time_ns = (
 60 |         kernel.run_time_ns *
 61 |         math.pow(bandwidth_factor, gamma) *
 62 |         math.pow(clock_factor, gamma_compl) *
 63 |         math.pow(sm_factor, gamma_compl)
 64 |     )
 65 |     return PredictedKernel(kernel, scaled_run_time_ns)
 66 | 
 67 | 
 68 | def _roofline_gamma(kernel, origin_device, dest_device):
 69 |     flop_efficiency = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency)
 70 |     dram_read_bytes = kernel.get_metric(Metric.DRAMReadBytes)
 71 |     dram_write_bytes = kernel.get_metric(Metric.DRAMWriteBytes)
 72 |     total_gb = (dram_read_bytes + dram_write_bytes) / 1024 / 1024 / 1024
 73 | 
 74 |     gflops_per_second = flop_efficiency / 100 * origin_device.peak_gflops_per_second
 75 |     num_gflops = gflops_per_second * kernel.run_time_ns / 1e9
 76 | 
 77 |     # We only consider the dest ridge point (R).
 78 |     # We use a decreasing linear function to interpolate between an intensity
 79 |     # of 0 and R, and use a 1/x function to map intensities greater than R.
 80 |     #
 81 |     #  gamma = -0.5/R * intensity + 1  if 0 <= intensity <= R
 82 |     #          0.5R / intensity        otherwise
 83 | 
 84 |     if num_gflops < 1e-9:
 85 |         # We treat these cases as fully memory bandwidth bound, even though
 86 |         # total_gb could also be 0
 87 |         gamma = 1.
 88 | 
 89 |     elif total_gb == 0:
 90 |         # num_gflops must be non-zero, so this means the kernel is fully
 91 |         # compute bound
 92 |         gamma = 0.
 93 | 
 94 |     else:
 95 |         intensity_gflops_per_gb = num_gflops / total_gb
 96 |         dest_ridge_point = _ridge_point(dest_device)
 97 | 
 98 |         if intensity_gflops_per_gb > dest_ridge_point:
 99 |             gamma = 0.5 * dest_ridge_point / intensity_gflops_per_gb
100 |         else:
101 |             gamma = -0.5 / dest_ridge_point * intensity_gflops_per_gb + 1.
102 | 
103 |     assert gamma >= 0 and gamma <= 1
104 |     return gamma
105 | 
106 | 
107 | def _ridge_point(device):
108 |     return device.peak_gflops_per_second / device.mem_bandwidth_gb
109 | 


--------------------------------------------------------------------------------
/analyzer/habitat/analysis/wave_scaling/unified.py:
--------------------------------------------------------------------------------
 1 | from habitat.analysis.metrics import Metric
 2 | from habitat.analysis.wave_scaling.resimplified import (
 3 |     resimplified_wave_scaling,
 4 | )
 5 | from habitat.analysis.wave_scaling.roofline import roofline_wave_scaling
 6 | 
 7 | 
 8 | def unified_wave_scaling(
 9 |     kernel,
10 |     origin_device,
11 |     dest_device,
12 |     metadata_manager,
13 | ):
14 |     try:
15 |         # Try reading metrics. These calls will raise exceptions if the metrics
16 |         # do not exist.
17 |         _ = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency)
18 |         _ = kernel.get_metric(Metric.DRAMReadBytes)
19 |         _ = kernel.get_metric(Metric.DRAMWriteBytes)
20 |         return roofline_wave_scaling(
21 |             kernel,
22 |             origin_device,
23 |             dest_device,
24 |             metadata_manager,
25 |         )
26 |     except AttributeError:
27 |         pass
28 | 
29 |     # Use resimplified wave scaling when metrics are unavailable
30 |     return resimplified_wave_scaling(
31 |         kernel,
32 |         origin_device,
33 |         dest_device,
34 |         metadata_manager,
35 |     )
36 | 


--------------------------------------------------------------------------------
/analyzer/habitat/data/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | _DATA_PATH = os.path.abspath(os.path.dirname(__file__))
4 | 
5 | 
6 | def path_to_data(data_file):
7 |     return os.path.join(_DATA_PATH, data_file)
8 | 


--------------------------------------------------------------------------------
/analyzer/habitat/data/checksums:
--------------------------------------------------------------------------------
1 | d445af3308bc67a087446bed3d7c160fcde458cb  bmm/model.pth
2 | b7c4a4ccd2a447a6a9d63e6658461f1f1a9dc08b  conv2d/model.pth
3 | 659a00c6cff529613b40d5166fe7d93f42e8327d  kernels.sqlite
4 | 930d5a79b35755ae7a8ab72a9b5585423dc02ab7  linear/model.pth
5 | 2da58bcea595de031584da214b29510b9e897466  lstm/model.pth
6 | 


--------------------------------------------------------------------------------
/analyzer/habitat/data/devices.yml:
--------------------------------------------------------------------------------
  1 | # NOTE: All GPU device "names" need to be valid Python identifiers. Therefore
  2 | # they cannot start with a numeric character.
  3 | 
  4 | P4000:
  5 |   compute_major: 6
  6 |   compute_minor: 1
  7 |   max_threads_per_block: 1024
  8 |   max_threads_per_multiprocessor: 2048
  9 |   regs_per_block: 65536
 10 |   regs_per_multiprocessor: 65536
 11 |   warp_size: 32
 12 |   shared_mem_per_block: 49152
 13 |   shared_mem_per_multiprocessor: 98304
 14 |   num_sms: 14
 15 |   shared_mem_per_block_optin: 49152
 16 |   mem_bandwidth_gb: 195
 17 |   base_clock_mhz: 1227
 18 |   peak_gflops_per_second: 6054
 19 | 
 20 | GTX1080Ti:
 21 |   compute_major: 6
 22 |   compute_minor: 1
 23 |   max_threads_per_block: 1024
 24 |   max_threads_per_multiprocessor: 2048
 25 |   regs_per_block: 65536
 26 |   regs_per_multiprocessor: 65536
 27 |   warp_size: 32
 28 |   shared_mem_per_block: 49152
 29 |   shared_mem_per_multiprocessor: 98304
 30 |   num_sms: 28
 31 |   shared_mem_per_block_optin: 49152
 32 |   mem_bandwidth_gb: 484
 33 |   base_clock_mhz: 1480
 34 |   peak_gflops_per_second: 10609
 35 | 
 36 | RTX2070:
 37 |   compute_major: 7
 38 |   compute_minor: 5
 39 |   max_threads_per_block: 1024
 40 |   max_threads_per_multiprocessor: 1024
 41 |   regs_per_block: 65536
 42 |   regs_per_multiprocessor: 65536
 43 |   warp_size: 32
 44 |   shared_mem_per_block: 49152
 45 |   shared_mem_per_multiprocessor: 65536
 46 |   num_sms: 36
 47 |   shared_mem_per_block_optin: 0
 48 |   mem_bandwidth_gb: 383
 49 |   base_clock_mhz: 1410
 50 |   peak_gflops_per_second: 4318
 51 | 
 52 | RTX2080Ti:
 53 |   compute_major: 7
 54 |   compute_minor: 5
 55 |   max_threads_per_block: 1024
 56 |   max_threads_per_multiprocessor: 1024
 57 |   regs_per_block: 65536
 58 |   regs_per_multiprocessor: 65536
 59 |   warp_size: 32
 60 |   shared_mem_per_block: 49152
 61 |   shared_mem_per_multiprocessor: 65536
 62 |   num_sms: 68
 63 |   shared_mem_per_block_optin: 0
 64 |   mem_bandwidth_gb: 524
 65 |   base_clock_mhz: 1350
 66 |   peak_gflops_per_second: 5938
 67 | 
 68 | P4:
 69 |   compute_major: 6
 70 |   compute_minor: 1
 71 |   max_threads_per_block: 1024
 72 |   max_threads_per_multiprocessor: 2048
 73 |   regs_per_block: 65536
 74 |   regs_per_multiprocessor: 65536
 75 |   warp_size: 32
 76 |   shared_mem_per_block: 49152
 77 |   shared_mem_per_multiprocessor: 98304
 78 |   num_sms: 20
 79 |   shared_mem_per_block_optin: 0
 80 |   mem_bandwidth_gb: 192
 81 |   base_clock_mhz: 810
 82 |   peak_gflops_per_second: 4147
 83 | 
 84 | T4:
 85 |   compute_major: 7
 86 |   compute_minor: 5
 87 |   max_threads_per_block: 1024
 88 |   max_threads_per_multiprocessor: 1024
 89 |   regs_per_block: 65536
 90 |   regs_per_multiprocessor: 65536
 91 |   warp_size: 32
 92 |   shared_mem_per_block: 49152
 93 |   shared_mem_per_multiprocessor: 65536
 94 |   num_sms: 40
 95 |   shared_mem_per_block_optin: 0
 96 |   mem_bandwidth_gb: 239
 97 |   base_clock_mhz: 585
 98 |   peak_gflops_per_second: 1804
 99 | 
100 | V100:
101 |   compute_major: 7
102 |   compute_minor: 0
103 |   max_threads_per_block: 1024
104 |   max_threads_per_multiprocessor: 2048
105 |   regs_per_block: 65536
106 |   regs_per_multiprocessor: 65536
107 |   warp_size: 32
108 |   shared_mem_per_block: 49152
109 |   shared_mem_per_multiprocessor: 98304
110 |   num_sms: 80
111 |   shared_mem_per_block_optin: 0
112 |   mem_bandwidth_gb: 739
113 |   base_clock_mhz: 1312
114 |   peak_gflops_per_second: 6716
115 | 
116 | P100:
117 |   compute_major: 6
118 |   compute_minor: 0
119 |   max_threads_per_block: 1024
120 |   max_threads_per_multiprocessor: 2048
121 |   regs_per_block: 65536
122 |   regs_per_multiprocessor: 65536
123 |   warp_size: 32
124 |   shared_mem_per_block: 49152
125 |   shared_mem_per_multiprocessor: 65536
126 |   num_sms: 56
127 |   shared_mem_per_block_optin: 0
128 |   mem_bandwidth_gb: 501
129 |   base_clock_mhz: 1126
130 |   peak_gflops_per_second: 9504
131 | 


--------------------------------------------------------------------------------
/analyzer/habitat/data/verify.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | CHECKSUM_FILE="checksums"
 4 | declare -a FILES=(
 5 |   "bmm/model.pth"
 6 |   "conv2d/model.pth"
 7 |   "kernels.sqlite"
 8 |   "linear/model.pth"
 9 |   "lstm/model.pth"
10 | )
11 | 
12 | function generate() {
13 |   rm -f $CHECKSUM_FILE
14 |   for file in "${FILES[@]}"; do
15 |     shasum $file >> $CHECKSUM_FILE
16 |   done
17 |   echo "Done! Checksum file has been generated."
18 | }
19 | 
20 | function validate() {
21 |   shasum -c $CHECKSUM_FILE
22 | }
23 | 
24 | function usage() {
25 |   echo "Usage: $0 [-g | --generate]"
26 |   echo ""
27 |   echo "This utility checks that Habitat's data files have the correct contents."
28 |   echo ""
29 |   echo "Use the -g or --generate options to generate the checksum file."
30 |   exit 1
31 | }
32 | 
33 | if [ -z "$1" ]; then
34 |   validate
35 | elif [ "$1" = "-g" ] || [ "$1" = "--generate" ]; then
36 |   generate
37 | else
38 |   usage "$@"
39 | fi
40 | 


--------------------------------------------------------------------------------
/analyzer/habitat/profiling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/profiling/__init__.py


--------------------------------------------------------------------------------
/analyzer/habitat/profiling/autograd.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | from habitat.profiling.backward import get_grad_fn, flatten_operation_output
 4 | 
 5 | 
 6 | class AutogradEngine:
 7 |     """
 8 |     Emulates the backward pass for a given model output, for timing purposes.
 9 |     """
10 |     def __init__(self, grad_fn_ordering, input_map, initial_inputs):
11 |         self._grad_fn_ordering = grad_fn_ordering
12 |         self._input_holder = {
13 |             fn: [None] * size for fn, size in input_map.items()
14 |         }
15 |         self._input_holder[self._grad_fn_ordering[0]] = initial_inputs
16 | 
17 |     @classmethod
18 |     def new_from(cls, operation_output, exclude_accumulate_grad=True):
19 |         # Traverse the autograd graph, build input map for each grad_fn and
20 |         # create a topological ordering
21 |         _, initial_grad_fn = get_grad_fn(operation_output)
22 |         if initial_grad_fn is None:
23 |             raise ValueError('No grad_fn available on the operation output.')
24 | 
25 |         ordering = []
26 |         input_map = {}
27 |         initial_inputs = [
28 |             tensor.detach()
29 |             for tensor in flatten_operation_output(operation_output)
30 |         ]
31 |         input_map[initial_grad_fn] = len(initial_inputs)
32 | 
33 |         stack = [(initial_grad_fn, 0)]
34 |         visited = {initial_grad_fn}
35 | 
36 |         # Build a topological ordering
37 |         while len(stack) > 0:
38 |             grad_fn, visit_count = stack.pop()
39 |             if visit_count != 0:
40 |                 ordering.append(grad_fn)
41 |                 continue
42 | 
43 |             stack.append((grad_fn, 1))
44 |             for next_fn, input_idx in grad_fn.next_functions:
45 |                 if next_fn is None:
46 |                     continue
47 | 
48 |                 if (exclude_accumulate_grad and
49 |                         next_fn.name() == 'torch::autograd::AccumulateGrad'):
50 |                     continue
51 | 
52 |                 # Keep track of the inputs to each grad_fn
53 |                 if next_fn not in input_map:
54 |                     input_map[next_fn] = 1
55 |                 input_map[next_fn] = max(input_map[next_fn], input_idx + 1)
56 | 
57 |                 # Determine whether to visit this grad_fn
58 |                 if next_fn in visited:
59 |                     continue
60 | 
61 |                 visited.add(next_fn)
62 |                 stack.append((next_fn, 0))
63 | 
64 |         ordering.reverse()
65 |         return cls(ordering, input_map, initial_inputs)
66 | 
67 |     def run_backward(self):
68 |         for grad_fn in self._grad_fn_ordering:
69 |             # 1. Run the backward function
70 |             outputs = grad_fn(*(self._input_holder[grad_fn]))
71 | 
72 |             # 2. Store its outputs for the next backward function(s)
73 |             if isinstance(outputs, torch.Tensor):
74 |                 outputs = [outputs]
75 |             for (output, (next_fn, input_idx)) in zip(
76 |                     outputs, grad_fn.next_functions):
77 |                 if next_fn is None or next_fn not in self._input_holder:
78 |                     continue
79 |                 # NOTE: If implementing to actually calculate the gradient, we
80 |                 # need to sum gradients that "flow" into the same grad function
81 |                 # input.
82 |                 self._input_holder[next_fn][input_idx] = output
83 | 


--------------------------------------------------------------------------------
/analyzer/habitat/profiling/backward.py:
--------------------------------------------------------------------------------
  1 | import torch
  2 | 
  3 | 
  4 | class BackwardHelper:
  5 |     def __init__(self, backward_runnable, ag_dict):
  6 |         self.run_backward = backward_runnable
  7 |         self._ag_dict = ag_dict
  8 | 
  9 |     @classmethod
 10 |     def new_from(cls, operation_outputs):
 11 |         retval, initial_grad_fn = get_grad_fn(operation_outputs)
 12 |         if initial_grad_fn is None:
 13 |             raise ValueError('No grad_fn available on the operation output.')
 14 | 
 15 |         grads = torch.ones_like(retval)
 16 |         def backward_runnable():
 17 |             torch.autograd.backward(retval, grads, retain_graph=True)
 18 | 
 19 |         size_dict = get_accumulate_grad_inputs(
 20 |             initial_grad_fn,
 21 |             backward_runnable,
 22 |         )
 23 | 
 24 |         ag_dict = {
 25 |             grad_fn: torch.randn(size, device=torch.device('cuda'))
 26 |             for grad_fn, size in size_dict.items()
 27 |         }
 28 | 
 29 |         return cls(backward_runnable, ag_dict)
 30 | 
 31 |     def run_accumulate_grad(self):
 32 |         for grad_fn, grad in self._ag_dict.items():
 33 |             grad_fn(grad)
 34 | 
 35 | 
 36 | def backward_available(operation_output):
 37 |     return get_grad_fn(operation_output)[1] is not None
 38 | 
 39 | 
 40 | def flatten_operation_output(operation_output):
 41 |     if isinstance(operation_output, torch.Tensor):
 42 |         return [operation_output]
 43 |     elif (not isinstance(operation_output, tuple) and
 44 |           not isinstance(operation_output, list)):
 45 |         return []
 46 | 
 47 |     flattened = []
 48 |     for value in operation_output:
 49 |         flattened.extend(flatten_operation_output(value))
 50 |     return flattened
 51 | 
 52 | 
 53 | def get_grad_fn(retval):
 54 |     if isinstance(retval, torch.Tensor) and retval.grad_fn is not None:
 55 |         return retval, retval.grad_fn
 56 |     elif isinstance(retval, tuple) or isinstance(retval, list):
 57 |         for inner_value in retval:
 58 |             inner_retval, grad_fn = get_grad_fn(inner_value)
 59 |             if grad_fn is not None:
 60 |                 return inner_retval, grad_fn
 61 | 
 62 |     return None, None
 63 | 
 64 | 
 65 | def get_accumulate_grad_inputs(initial_grad_fn, backward_runnable):
 66 |     input_dict = {}
 67 |     hook_handles = []
 68 |     def get_hook(grad_fn):
 69 |         def hook(arg1, arg2):
 70 |             if not isinstance(arg2[0], torch.Tensor):
 71 |                 return
 72 |             input_dict[grad_fn] = arg2[0].size()
 73 |         return hook
 74 | 
 75 |     # Traverse the graph to identify all AccumulateGrad functions
 76 |     stack = [initial_grad_fn]
 77 |     visited = {initial_grad_fn}
 78 | 
 79 |     while len(stack) > 0:
 80 |         grad_fn = stack.pop()
 81 | 
 82 |         if grad_fn.name() == 'torch::autograd::AccumulateGrad':
 83 |             hook_handles.append(grad_fn.register_hook(get_hook(grad_fn)))
 84 | 
 85 |         for next_grad_fn, _ in grad_fn.next_functions:
 86 |             if next_grad_fn is None or next_grad_fn in visited:
 87 |                 continue
 88 |             stack.append(next_grad_fn)
 89 |             visited.add(next_grad_fn)
 90 | 
 91 |     # Run a backward pass to get accumulate grad sizes
 92 |     backward_runnable()
 93 |     torch.cuda.synchronize()
 94 | 
 95 |     # Clear hooks
 96 |     for handle in hook_handles:
 97 |         handle.remove()
 98 | 
 99 |     return input_dict
100 | 


--------------------------------------------------------------------------------
/analyzer/habitat/profiling/kernel.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import habitat.habitat_cuda as hc
 3 | 
 4 | from habitat.analysis import SPECIAL_OPERATIONS
 5 | from habitat.analysis.metrics import resolve_metrics
 6 | from habitat.analysis.kernels import MeasuredKernel
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | class KernelProfiler:
12 |     def __init__(self, device, metrics=None, metrics_threshold_ms=0):
13 |         self._device = device
14 |         self._metrics = resolve_metrics(metrics, self._device)
15 |         self._metrics_threshold_ns = metrics_threshold_ms * 1000000
16 | 
17 |     def measure_kernels(self, runnable, func_name=None):
18 |         """
19 |         Uses CUPTI to measure the kernels launched by runnable.
20 | 
21 |         Returns:
22 |           A list of MeasuredKernels
23 |         """
24 |         if func_name is None:
25 |             fname = (
26 |                 runnable.__name__ if hasattr(runnable, "__name__")
27 |                 else "Unnamed"
28 |             )
29 |         else:
30 |             fname = func_name
31 | 
32 |         return list(map(
33 |             lambda ks: MeasuredKernel(ks[0], ks[1], self._device),
34 |             self._measure_kernels_raw(runnable, fname)
35 |         ))
36 | 
37 |     def _measure_kernels_raw(self, runnable, func_name):
38 |         """
39 |         Uses CUPTI to measure the kernels launched by runnable.
40 | 
41 |         Returns:
42 |           A list of tuples, where
43 |             - tuple[0] is the raw kernel measurement that should be used for
44 |               the kernel's run time
45 |             - tuple[1] is a list of the raw kernel measurements that contain
46 |               the metrics requested
47 |         """
48 |         time_kernels = hc.profile(runnable)
49 |         if (len(self._metrics) == 0 or
50 |                 func_name in SKIP_METRICS or
51 |                 func_name in SPECIAL_OPERATIONS or
52 |                 self._under_threshold(time_kernels)):
53 |             return list(map(lambda tk: (tk, []), time_kernels))
54 | 
55 |         try:
56 |             metric_kernels = [
57 |                 hc.profile(runnable, metric) for metric in self._metrics
58 |             ]
59 |             # Make sure the same number of kernels are recorded for each metric
60 |             assert all(map(
61 |                 lambda ks: len(ks) == len(metric_kernels[0]),
62 |                 metric_kernels,
63 |             ))
64 |             # metric_kernels is originally (# metrics x # kernels in op)
65 |             # we need to transpose it to become (# kernels in op x # metrics)
66 |             # so that we can join kernels with their metrics.
67 |             transposed = map(list, zip(*metric_kernels))
68 |             # We return a list of (time kernel, [metric kernels])
69 |             return list(zip(time_kernels, transposed))
70 |         except RuntimeError as ex:
71 |             logger.warn(
72 |                 'Metrics error "%s" for function "%s".',
73 |                 str(ex),
74 |                 func_name,
75 |             )
76 |             return list(map(lambda tk: (tk, []), time_kernels))
77 | 
78 |     def _under_threshold(self, kernels):
79 |         # If under threshold, don't measure metrics
80 |         return (
81 |             sum(map(lambda k: k.run_time_ns, kernels))
82 |             <= self._metrics_threshold_ns
83 |         )
84 | 
85 | 
86 | SKIP_METRICS = {
87 |     "detach_",
88 | }
89 | 


--------------------------------------------------------------------------------
/analyzer/habitat/profiling/run_time.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | 
 4 | class RunTimeProfiler:
 5 |     def __init__(self, warm_up=3, measure_for=3):
 6 |         self._warm_up = warm_up
 7 |         self._measure_for = measure_for
 8 |         self._start_event = torch.cuda.Event(enable_timing=True)
 9 |         self._end_event = torch.cuda.Event(enable_timing=True)
10 | 
11 |     def measure_ms(self, runnable):
12 |         for _ in range(self._warm_up):
13 |             runnable()
14 | 
15 |         self._start_event.record()
16 |         for _ in range(self._measure_for):
17 |             runnable()
18 |         self._end_event.record()
19 |         torch.cuda.synchronize()
20 | 
21 |         return (
22 |             self._start_event.elapsed_time(self._end_event) / self._measure_for
23 |         )
24 | 


--------------------------------------------------------------------------------
/analyzer/habitat/tracking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/tracking/__init__.py


--------------------------------------------------------------------------------
/analyzer/habitat/tracking/base.py:
--------------------------------------------------------------------------------
 1 | import contextlib
 2 | 
 3 | 
 4 | class TrackerBase:
 5 |     def __init__(self):
 6 |         self._is_tracking = False
 7 | 
 8 |     @contextlib.contextmanager
 9 |     def track(self):
10 |         self.start_tracking()
11 |         try:
12 |             yield self
13 |         finally:
14 |             self.stop_tracking()
15 | 
16 |     def start_tracking(self):
17 |         self._is_tracking = True
18 | 
19 |     def stop_tracking(self):
20 |         self._is_tracking = False
21 | 


--------------------------------------------------------------------------------
/analyzer/habitat/tracking/callable.py:
--------------------------------------------------------------------------------
  1 | import inspect
  2 | 
  3 | import torch
  4 | 
  5 | from habitat.tracking.base import TrackerBase
  6 | from habitat.tracking.hook_manager import HookManager
  7 | 
  8 | 
  9 | class CallableTracker(TrackerBase):
 10 |     def __init__(self, hook_creator):
 11 |         super().__init__()
 12 |         self._hook_manager = HookManager()
 13 |         self._hook_creator = hook_creator
 14 | 
 15 |     def start_tracking(self):
 16 |         super().start_tracking()
 17 |         self._hook_manager.attach_hooks_on_module(
 18 |             torch,
 19 |             lambda fn: _is_callable_and_public(fn) and \
 20 |               fn.__name__ not in BLACKLISTED_TORCH_METHODS,
 21 |             self._hook_creator,
 22 |         )
 23 |         self._hook_manager.attach_hooks_on_module(
 24 |             torch.Tensor,
 25 |             lambda fn: _is_callable_and_public(fn) and \
 26 |               fn.__name__ != 'backward' and \
 27 |               fn.__name__ not in BLACKLISTED_TENSOR_METHODS,
 28 |             self._hook_creator,
 29 |         )
 30 |         self._hook_manager.attach_hooks_on_module(
 31 |             torch.Tensor,
 32 |             _is_callable_dunder,
 33 |             self._hook_creator,
 34 |         )
 35 |         self._hook_manager.attach_hooks_on_module(
 36 |             torch.nn.functional,
 37 |             _is_callable_and_public,
 38 |             self._hook_creator,
 39 |         )
 40 |         self._hook_manager.attach_hooks_on_module_using(
 41 |             torch.nn._VF,
 42 |             torch._C._VariableFunctions,
 43 |             _is_callable_and_public,
 44 |             self._hook_creator,
 45 |         )
 46 | 
 47 |     def stop_tracking(self):
 48 |         super().stop_tracking()
 49 |         self._hook_manager.remove_hooks()
 50 | 
 51 | 
 52 | def _is_callable_and_public(maybe_fn):
 53 |     # By convention, _ prefixed functions in Python should not be
 54 |     # called by users (i.e. they are "private" functions)
 55 |     return _is_callable(maybe_fn) and maybe_fn.__name__[0] != '_'
 56 | 
 57 | # Original source of these blacklists:
 58 | # https://github.com/NVIDIA/apex/blob/master/apex/pyprof/nvtx/nvmarker.py
 59 | BLACKLISTED_DUNDERS = {
 60 |     '__all__',
 61 |     '__array__',
 62 |     '__array_priority__',
 63 |     '__array_wrap__',
 64 |     '__bool__',
 65 |     '__builtins__',
 66 |     '__cached__',
 67 |     '__class__',
 68 |     '__deepcopy__',
 69 |     '__delattr__',
 70 |     '__delitem__',
 71 |     '__dict__',
 72 |     '__dir__',
 73 |     '__doc__',
 74 |     '__file__',
 75 |     '__format__',
 76 |     '__getattribute__',
 77 |     '__getitem__',
 78 |     '__hash__',
 79 |     '__index__',
 80 |     '__init__',
 81 |     '__init_subclass__',
 82 |     '__iter__',
 83 |     '__len__',
 84 |     '__loader__',
 85 |     '__module__',
 86 |     '__name__',
 87 |     '__new__',
 88 |     '__nonzero__',
 89 |     '__package__',
 90 |     '__path__',
 91 |     '__reduce__',
 92 |     '__reduce_ex__',
 93 |     '__repr__',
 94 |     '__reversed__',
 95 |     '__setattr__',
 96 |     '__setitem__',
 97 |     '__setstate__',
 98 |     '__sizeof__',
 99 |     '__spec__',
100 |     '__str__',
101 |     '__subclasshook__',
102 |     '__version__',
103 |     '__weakref__',
104 | }
105 | 
106 | BLACKLISTED_TENSOR_METHODS = {
107 |     'size', 'dim', 'item', 'tolist',
108 | }
109 | 
110 | BLACKLISTED_TORCH_METHODS = {
111 |     'is_storage',
112 | }
113 | 
114 | 
115 | def _is_callable_dunder(maybe_fn):
116 |     """
117 |     Returns True if maybe_fn is a callable dunder (callable named with double
118 |     underscores) (e.g., __add__)
119 |     """
120 |     return (
121 |         _is_callable(maybe_fn) and
122 |         len(maybe_fn.__name__) > 4 and
123 |         maybe_fn.__name__[:2] == '__' and
124 |         maybe_fn.__name__[-2:] == '__' and
125 |         maybe_fn.__name__ not in BLACKLISTED_DUNDERS
126 |     )
127 | 
128 | 
129 | def _is_callable(maybe_fn):
130 |     return (
131 |         inspect.isfunction(maybe_fn) or
132 |         inspect.ismethod(maybe_fn) or
133 |         inspect.isbuiltin(maybe_fn) or
134 |         inspect.isroutine(maybe_fn)
135 |     )
136 | 


--------------------------------------------------------------------------------
/analyzer/habitat/tracking/hook_manager.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | class HookManager:
 4 |     def __init__(self):
 5 |         self._original_callables = {}
 6 | 
 7 |     def attach_hooks_on_module(self, module, predicate, hook_creator):
 8 |         self.attach_hooks_on_module_using(
 9 |             module, module, predicate, hook_creator)
10 | 
11 |     def attach_hooks_on_module_using(
12 |             self, module, using_module, predicate, hook_creator):
13 |         """
14 |         Attach hooks onto functions in the provided module. Use the
15 |         `using_module` to discover the existing functions.
16 |         """
17 |         for prop in dir(using_module):
18 |             if not predicate(getattr(module, prop)):
19 |                 continue
20 |             self.attach_hook(module, prop, hook_creator)
21 | 
22 |     def attach_hook(self, module, prop, hook_creator):
23 |         target = getattr(module, prop)
24 |         self._maybe_store_callable(module, prop, target)
25 |         setattr(module, prop, hook_creator(target))
26 | 
27 |     def remove_hooks(self):
28 |         for module, callable_pairs in self._original_callables.items():
29 |             for prop, original_callable in callable_pairs.items():
30 |                 setattr(module, prop, original_callable)
31 |         self._original_callables.clear()
32 | 
33 |     def _maybe_store_callable(self, module, prop, original_callable):
34 |         """
35 |         Store the original callable (to be able to restore it) only when it is
36 |         the first time we are encountering the given callable.
37 |         """
38 |         if module not in self._original_callables:
39 |             self._original_callables[module] = {}
40 | 
41 |         if prop in self._original_callables[module]:
42 |             return
43 | 
44 |         self._original_callables[module][prop] = original_callable
45 | 


--------------------------------------------------------------------------------
/analyzer/habitat/tracking/operation.py:
--------------------------------------------------------------------------------
 1 | from habitat.analysis import SPECIAL_OPERATIONS
 2 | from habitat.analysis.arguments import Arguments
 3 | from habitat.analysis.operation import MeasuredOperation
 4 | from habitat.analysis.trace import Trace
 5 | from habitat.profiling.operation import OperationProfiler
 6 | from habitat.tracking.base import TrackerBase
 7 | from habitat.tracking.callable import CallableTracker
 8 | 
 9 | 
10 | class OperationTracker(TrackerBase):
11 |     def __init__(self, device, metrics=None, metrics_threshold_ms=0):
12 |         super().__init__()
13 |         self._device = device
14 |         self._callable_tracker = CallableTracker(self._hook_creator)
15 |         self._profiler = OperationProfiler(
16 |             device,
17 |             metrics,
18 |             metrics_threshold_ms,
19 |         )
20 |         self._processing_hook = False
21 | 
22 |         self._operations = []
23 | 
24 |     def start_tracking(self):
25 |         super().start_tracking()
26 |         self._callable_tracker.start_tracking()
27 | 
28 |     def stop_tracking(self):
29 |         super().stop_tracking()
30 |         self._callable_tracker.stop_tracking()
31 | 
32 |     def get_tracked_trace(self):
33 |         return Trace(self._device, self._operations)
34 | 
35 |     def _hook_creator(self, func):
36 |         def hook(*args, **kwargs):
37 |             # NOTE: We use self._processing_hook to handle cases where we have
38 |             #       hooks on nested function calls.
39 |             if self._processing_hook:
40 |                 return func(*args, **kwargs)
41 | 
42 |             self._processing_hook = True
43 |             try:
44 |                 # We only track the arguments if the operation is "special"
45 |                 # (i.e. we use special handling to scale it to a different
46 |                 # device).
47 |                 is_special_op = func.__name__ in SPECIAL_OPERATIONS
48 |                 arguments = (
49 |                     Arguments.from_raw_arguments(args, kwargs)
50 |                     if is_special_op else None
51 |                 )
52 | 
53 |                 if (func.__name__ == 'lstm' and
54 |                         isinstance(arguments.args[4], bool)):
55 |                     # Special case - we need this information for the lstm
56 |                     # operation
57 |                     arguments.special['batch_sizes'] = args[1].tolist()
58 | 
59 |                 forward, backward = self._profiler.measure_operation(
60 |                     func,
61 |                     args,
62 |                     kwargs,
63 |                 )
64 |                 self._operations.append(MeasuredOperation(
65 |                     name=func.__name__,
66 |                     arguments=arguments,
67 |                     forward=forward,
68 |                     backward=backward,
69 |                     device=self._device,
70 |                 ))
71 | 
72 |                 # Actually run the hooked function
73 |                 return func(*args, **kwargs)
74 |             finally:
75 |                 self._processing_hook = False
76 | 
77 |         return hook
78 | 


--------------------------------------------------------------------------------
/analyzer/habitat/utils.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import yaml
 4 | 
 5 | 
 6 | def set_up_logging():
 7 |     logging.basicConfig(
 8 |         level=logging.INFO,
 9 |         format='%(asctime)s %(levelname)-8s %(message)s',
10 |         datefmt='%Y-%m-%d %H:%M',
11 |     )
12 | 
13 | 
14 | def add_common_cmd_args(parser):
15 |     parser.add_argument('model_path', help='The serialized model to analyze')
16 |     parser.add_argument(
17 |         'model_config_path',
18 |         help='The configuration file for the model',
19 |     )
20 |     parser.add_argument(
21 |         '--device-config',
22 |         type=str,
23 |         default='devices.yml',
24 |         help='The config file containing GPU device properties.',
25 |     )
26 |     parser.add_argument(
27 |         '--origin-device',
28 |         type=str,
29 |         required=True,
30 |         help='The GPU on which the analysis is being performed.',
31 |     )
32 |     parser.add_argument(
33 |         '--kernel-lut',
34 |         type=str,
35 |         default=os.path.join('lutfiles', 'kernels.sqlite'),
36 |         help='The path to the kernel metadata look up table.',
37 |     )
38 |     parser.add_argument(
39 |         '--operation-lut',
40 |         type=str,
41 |         default=os.path.join('lutfiles', 'operations.sqlite'),
42 |         help='The path to the operation run time look up table.',
43 |     )
44 | 
45 | 
46 | def ns_to_ms(ns):
47 |     return ns / 1e6
48 | 
49 | 
50 | def ms_to_ns(ms):
51 |     return ms * 1e6
52 | 
53 | 
54 | def name_all_arguments(all_parameters, args, kwargs):
55 |     """
56 |     This function merges positional and keyword arguments
57 |     into one dictionary based on the declared names of the
58 |     function's parameters.
59 |     """
60 |     merged = {**kwargs}
61 |     for arg_name, arg in zip(all_parameters, args):
62 |         merged[arg_name] = arg
63 |     return merged
64 | 


--------------------------------------------------------------------------------
/analyzer/install-dev.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | SO_NAME="habitat_cuda.cpython-36m-x86_64-linux-gnu.so"
 4 | PACKAGE_NAME="habitat-predict"
 5 | 
 6 | # Operate out of the script directory
 7 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P)
 8 | cd $SCRIPT_PATH
 9 | 
10 | # Abort if an error occurs
11 | set -e
12 | 
13 | function pushd() {
14 |   command pushd "$@" > /dev/null
15 | }
16 | 
17 | function popd() {
18 |   command popd "$@" > /dev/null
19 | }
20 | 
21 | function compile_habitat_cuda() {
22 |   echo "Compiling the Habitat C++ extension..."
23 |   pushd ../cpp
24 |   mkdir -p build
25 |   pushd build
26 | 
27 |   cmake -DCMAKE_BUILD_TYPE=Release ..
28 |   make -j 8 habitat_cuda
29 | 
30 |   if [ ! -f $SO_NAME ]; then
31 |     echo "ERROR: Could not find $SO_NAME after compilation. Please double "
32 |     echo "check that compilation completed successfully."
33 |     exit 1
34 |   fi
35 | 
36 |   popd
37 |   popd
38 |   echo ""
39 | }
40 | 
41 | function symlink_habitat_cuda() {
42 |   echo "Adding a symbolic link to the Habitat C++ extension..."
43 |   if [ ! -h habitat/$SO_NAME ]; then
44 |     ln -s ../../cpp/build/$SO_NAME habitat
45 |   fi
46 |   echo ""
47 | }
48 | 
49 | function install_habitat() {
50 |   echo "Install an editable version of the Habitat package..."
51 |   pip3 install --editable .
52 |   echo ""
53 | }
54 | 
55 | function uninstall_habitat() {
56 |   pip3 uninstall $PACKAGE_NAME
57 | }
58 | 
59 | function check_prereqs() {
60 |   if [ -z $(which cmake) ]; then
61 |     echo "Please ensure cmake 3.17+ is installed."
62 |     exit 1
63 |   fi
64 |   if [ -z $(which make) ]; then
65 |     echo "Please ensure make is installed."
66 |   fi
67 |   if [ -z $(which pip3) ]; then
68 |     echo "Please ensure pip3 is installed."
69 |     exit 1
70 |   fi
71 | }
72 | 
73 | function main() {
74 |   if [ "$1" = "--uninstall" ]; then
75 |     uninstall_habitat
76 |   else
77 |     check_prereqs
78 |     compile_habitat_cuda
79 |     symlink_habitat_cuda
80 |     install_habitat
81 |   fi
82 | }
83 | 
84 | main $@
85 | 


--------------------------------------------------------------------------------
/analyzer/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=40.6.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/analyzer/setup.py:
--------------------------------------------------------------------------------
  1 | import codecs
  2 | import os
  3 | import re
  4 | import sys
  5 | 
  6 | from setuptools import setup, find_packages
  7 | 
  8 | # Acknowledgement: This setup.py was adapted from Hynek Schlawack's Python
  9 | #                  Packaging Guide
 10 | # https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty
 11 | 
 12 | ###################################################################
 13 | 
 14 | NAME = "habitat-predict"
 15 | PACKAGES = find_packages()
 16 | META_PATH = os.path.join("habitat", "__init__.py")
 17 | README_PATH = "README.md"
 18 | PYTHON_REQUIRES = ">=3.6"
 19 | 
 20 | PACKAGE_DATA = {
 21 |     "habitat": [
 22 |         "data/hints.yml",
 23 |         "data/bmm/model.pth",
 24 |         "data/conv2d/model.pth",
 25 |         "data/kernels.sqlite",
 26 |         "data/linear/model.pth",
 27 |         "data/lstm/model.pth",
 28 |         "habitat_cuda.cpython-36m-x86_64-linux-gnu.so",
 29 |     ],
 30 | }
 31 | 
 32 | INSTALL_REQUIRES = [
 33 | #    "pyyaml",
 34 | #    "torch>=1.4.0",
 35 |     "pandas>=1.1.2",
 36 |     "tqdm>=4.49.0"
 37 | ]
 38 | 
 39 | KEYWORDS = [
 40 |     "neural networks",
 41 |     "pytorch",
 42 |     "performance",
 43 |     "profiler",
 44 |     "predictions",
 45 | ]
 46 | 
 47 | CLASSIFIERS = [
 48 |     "Do Not Upload",
 49 |     "Development Status :: 3 - Alpha",
 50 |     "Intended Audience :: Developers",
 51 |     "License :: OSI Approved :: Apache Software License",
 52 |     "Programming Language :: Python :: 3 :: Only",
 53 | ]
 54 | 
 55 | ###################################################################
 56 | 
 57 | HERE = os.path.abspath(os.path.dirname(__file__))
 58 | 
 59 | 
 60 | def read(*parts):
 61 |     """
 62 |     Build an absolute path from *parts* and return the contents of the
 63 |     resulting file. Assume UTF-8 encoding.
 64 |     """
 65 |     with codecs.open(os.path.join(HERE, *parts), "rb", "utf-8") as f:
 66 |         return f.read()
 67 | 
 68 | 
 69 | META_FILE = read(META_PATH)
 70 | 
 71 | 
 72 | def find_meta(meta):
 73 |     """
 74 |     Extract __*meta*__ from META_FILE.
 75 |     """
 76 |     meta_match = re.search(
 77 |         r"^__{meta}__ = ['\"]([^'\"]*)['\"]".format(meta=meta),
 78 |         META_FILE, re.M
 79 |     )
 80 |     if meta_match:
 81 |         return meta_match.group(1)
 82 |     raise RuntimeError("Unable to find __{meta}__ string.".format(meta=meta))
 83 | 
 84 | 
 85 | if __name__ == "__main__":
 86 |     setup(
 87 |         name=NAME,
 88 |         description=find_meta("description"),
 89 |         license=find_meta("license"),
 90 |         version=find_meta("version"),
 91 |         author=find_meta("author"),
 92 |         author_email=find_meta("email"),
 93 |         maintainer=find_meta("author"),
 94 |         maintainer_email=find_meta("email"),
 95 |         long_description=read(README_PATH),
 96 |         long_description_content_type="text/markdown",
 97 |         packages=PACKAGES,
 98 |         package_data=PACKAGE_DATA,
 99 |         python_requires=PYTHON_REQUIRES,
100 |         install_requires=INSTALL_REQUIRES,
101 |         classifiers=CLASSIFIERS,
102 |         keywords=KEYWORDS,
103 |     )
104 | 


--------------------------------------------------------------------------------
/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | # CMake build directories
2 | build
3 | debug
4 | cmake-build-debug
5 | 
6 | # CLion project directory
7 | .idea
8 | 


--------------------------------------------------------------------------------
/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR)
 2 | project(habitat LANGUAGES C CXX CUDA)
 3 | 
 4 | # Include our custom find module files
 5 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 6 | 
 7 | # Find all dependencies
 8 | find_package(CUDAToolkit REQUIRED)
 9 | # We need CUPTI here because CUDAToolkit does not include the nvperf-related
10 | # libraries and headers that we need.
11 | find_package(CUPTI REQUIRED)
12 | find_package(NVPerf REQUIRED)
13 | add_subdirectory(external)
14 | 
15 | ################################################################################
16 | # Habitat Targets
17 | ################################################################################
18 | 
19 | # habitat_cuda: Builds a Python importable module that provides bindings to the
20 | #               CUDA-related functionality used by Habitat.
21 | set(HabitatCUDA "habitat_cuda")
22 | pybind11_add_module(${HabitatCUDA} src/habitat_cuda.cpp)
23 | set_property(TARGET ${HabitatCUDA} PROPERTY CXX_STANDARD 11)
24 | 
25 | # device_info: Builds a utility executable that prints information about the
26 | #              underlying GPU device (e.g., number of SMs, memory bandwidth).
27 | set(DeviceInfo "device_info")
28 | add_executable(${DeviceInfo} src/device_info.cpp)
29 | set_property(TARGET ${DeviceInfo} PROPERTY CXX_STANDARD 11)
30 | 
31 | ################################################################################
32 | 
33 | # Add our source files
34 | cmake_policy(SET CMP0076 NEW)
35 | add_subdirectory(src)
36 | 
37 | # Specify dependencies for each target
38 | target_link_libraries(${HabitatCUDA} PRIVATE habitat-cuda-lib)
39 | set(HabitatCUDATransitiveDeps habitat-cuda-lib cupti_profilerhost_util)
40 | target_link_libraries(${DeviceInfo} PRIVATE CUDA::cudart gflags::gflags)
41 | 
42 | # Since pybind11 modules are shared libraries, all the static libraries it
43 | # depends on must be compiled as position independent code.
44 | foreach(LIB ${HabitatCUDATransitiveDeps})
45 |   set_property(TARGET ${LIB} PROPERTY POSITION_INDEPENDENT_CODE ON)
46 | endforeach()
47 | 
48 | # Turn on all compile warnings
49 | set(AllTargets ${HabitatCUDA})
50 | foreach(TGT ${AllTargets})
51 |   if(CMAKE_COMPILER_IS_GNUCC)
52 |     target_compile_options(${TGT} PRIVATE "-Wall")
53 |   endif()
54 | endforeach()
55 | 


--------------------------------------------------------------------------------
/cpp/README.md:
--------------------------------------------------------------------------------
 1 | Habitat C++ Sources
 2 | ===================
 3 | This directory contains the C++ source code for Habitat. All the C++ code is
 4 | kept in this unified directory to simplify code sharing. CMake is used to
 5 | compile the code.
 6 | 
 7 | The C++ code is currently used to build two targets:
 8 | 
 9 | - `habitat_cuda`: A Python-importable module that provides bindings to the
10 |   CUDA-related functionality used by Habitat.
11 | - `device_info`: A utility that prints information about the underlying GPU
12 |   (e.g., number of SMs, memory bandwidth, etc.).
13 | 
14 | Code Organization
15 | -----------------
16 | Each target corresponds to one file under the `src` directory. The rest of the
17 | supporting code is organized in subdirectories under `src`.
18 | 
19 | If code is shared among multiple targets, it will likely be organized into a
20 | independently compiled library that can be linked to the needed targets.
21 | 


--------------------------------------------------------------------------------
/cpp/cmake/FindCUPTI.cmake:
--------------------------------------------------------------------------------
 1 | #[=======================================================================[.rst:
 2 | FindCUPTI
 3 | ---------
 4 | 
 5 | Finds the CUPTI library.
 6 | 
 7 | To specify a custom location, set the ``CUPTI_DIR`` environment variable.
 8 | This find module file will look under ``$CUPTI_DIR/include`` and ``$CUPTI_DIR/lib/x64``
 9 | for the headers and CUPTI shared library respectively.
10 | 
11 | Result Variables
12 | ^^^^^^^^^^^^^^^^
13 | 
14 | This will define the following variables:
15 | 
16 | ``CUPTI_FOUND``
17 |   True if the system has CUPTI.
18 | ``CUPTI_INCLUDE_DIRS``
19 |   Include directories needed to use CUPTI.
20 | ``CUPTI_LIBRARIES``
21 |   Libraries needed to link to CUPTI.
22 | 
23 | Cache Variables
24 | ^^^^^^^^^^^^^^^
25 | 
26 | The following cache variables may also be set:
27 | 
28 | ``CUPTI_INCLUDE_DIR``
29 |   The directory containing ``cupti.h``.
30 | ``CUPTI_LIBRARY``
31 |   The path to the CUPTI library.
32 | 
33 | #]=======================================================================]
34 | 
35 | include(FindPackageHandleStandardArgs)
36 | 
37 | if(DEFINED ENV{CUDA_HOME})
38 |   SET(CUPTI_CUDA_HOME "$ENV{CUDA_HOME}")
39 | endif()
40 | 
41 | if(DEFINED ENV{CUPTI_DIR})
42 |   SET(CUPTI_DIR "$ENV{CUPTI_DIR}")
43 | endif()
44 | 
45 | find_path(CUPTI_INCLUDE_DIR cupti.h
46 |   HINTS
47 |   ${CUPTI_DIR}/include
48 |   ${CUPTI_CUDA_HOME}/extras/CUPTI/include
49 |   /usr/local/cuda/extras/CUPTI/include
50 | )
51 | 
52 | find_library(CUPTI_LIBRARY cupti
53 |   HINTS
54 |   ${CUPTI_DIR}/lib/x64
55 |   ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64
56 |   /usr/local/cuda/extras/CUPTI/lib64
57 | )
58 | 
59 | find_package_handle_standard_args(CUPTI
60 |   DEFAULT_MSG
61 |   CUPTI_INCLUDE_DIR
62 |   CUPTI_LIBRARY
63 | )
64 | 
65 | if(CUPTI_FOUND)
66 |   set(CUPTI_INCLUDE_DIRS ${CUPTI_INCLUDE_DIR})
67 |   set(CUPTI_LIBRARIES ${CUPTI_LIBRARY})
68 | 
69 |   message(STATUS "Found CUPTI includes: ${CUPTI_INCLUDE_DIRS}")
70 |   message(STATUS "Found CUPTI library: ${CUPTI_LIBRARIES}")
71 | endif()
72 | 


--------------------------------------------------------------------------------
/cpp/cmake/FindNVPerf.cmake:
--------------------------------------------------------------------------------
 1 | #[=======================================================================[.rst:
 2 | FindNVPerf
 3 | ---------
 4 | 
 5 | Finds the NVPerf library.
 6 | 
 7 | Note that NVPerf is usually distributed with CUPTI. Therefore to specify a
 8 | custom location, set the ``CUPTI_DIR`` environment variable. This find module
 9 | file will look under ``$CUPTI_DIR/lib/x64`` for the NVPerf shared library.
10 | 
11 | Result Variables
12 | ^^^^^^^^^^^^^^^^
13 | 
14 | This will define the following variables:
15 | 
16 | ``NVPerf_FOUND``
17 |   True if the system has NVPerf.
18 | ``NVPerf_LIBRARIES``
19 |   Libraries needed to link to NVPerf.
20 | 
21 | Cache Variables
22 | ^^^^^^^^^^^^^^^
23 | 
24 | The following cache variables may also be set:
25 | 
26 | ``NVPerf_HOST_LIBRARY``
27 |   The path to the NVPerf host library.
28 | ``NVPerf_TARGET_LIBRARY``
29 |   The path to the NVPerf target library.
30 | 
31 | #]=======================================================================]
32 | 
33 | include(FindPackageHandleStandardArgs)
34 | 
35 | if(DEFINED ENV{CUDA_HOME})
36 |   SET(CUPTI_CUDA_HOME "$ENV{CUDA_HOME}")
37 | endif()
38 | 
39 | if(DEFINED ENV{CUPTI_DIR})
40 |   SET(CUPTI_DIR "$ENV{CUPTI_DIR}")
41 | endif()
42 | 
43 | find_library(NVPerf_HOST_LIBRARY nvperf_host
44 |   HINTS
45 |   ${CUPTI_DIR}/lib/x64
46 |   ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64
47 |   /usr/local/cuda/extras/CUPTI/lib64
48 | )
49 | 
50 | find_library(NVPerf_TARGET_LIBRARY nvperf_target
51 |   HINTS
52 |   ${CUPTI_DIR}/lib/x64
53 |   ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64
54 |   /usr/local/cuda/extras/CUPTI/lib64
55 | )
56 | 
57 | find_package_handle_standard_args(NVPerf
58 |   DEFAULT_MSG
59 |   NVPerf_HOST_LIBRARY
60 |   NVPerf_TARGET_LIBRARY
61 | )
62 | 
63 | if(NVPerf_FOUND)
64 |   set(NVPerf_LIBRARIES ${NVPerf_HOST_LIBRARY} ${NVPerf_TARGET_LIBRARY})
65 |   message(STATUS "Found NVPerf libraries: ${NVPerf_LIBRARIES}")
66 | endif()
67 | 


--------------------------------------------------------------------------------
/cpp/external/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(cupti_profilerhost_util)
2 | add_subdirectory(gflags)
3 | add_subdirectory(pybind11)
4 | 


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(CuptiProfilerHost "cupti_profilerhost_util")
 2 | 
 3 | add_library(${CuptiProfilerHost}
 4 |   src/profilerhost_util/Eval.cpp
 5 |   src/profilerhost_util/List.cpp
 6 |   src/profilerhost_util/Metric.cpp
 7 | )
 8 | 
 9 | target_link_libraries(${CuptiProfilerHost}
10 |   PRIVATE
11 |     CUDA::cupti
12 |     ${NVPerf_LIBRARIES}
13 | )
14 | 
15 | target_include_directories(${CuptiProfilerHost}
16 |   PUBLIC
17 |     ${CMAKE_CURRENT_SOURCE_DIR}/include/c_util
18 |     ${CMAKE_CURRENT_SOURCE_DIR}/include/profilerhost_util
19 |   PRIVATE
20 |     # We need to add this in addition to the CUDA::cupti
21 |     # target above because this variable includes the
22 |     # nvperf headers (and the CUDA::cupti target does not).
23 |     ${CUPTI_INCLUDE_DIRS}
24 | )
25 | 


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/c_util/FileOp.h:
--------------------------------------------------------------------------------
 1 | #include <fstream>
 2 | #include <vector>
 3 | #include <iostream>
 4 | 
 5 | bool WriteBinaryFile(const char* pFileName, const std::vector<uint8_t>& data)
 6 | {
 7 |     FILE* fp = fopen(pFileName, "wb");
 8 |     if (fp)
 9 |     {
10 |         if (data.size())
11 |         {
12 |             fwrite(&data[0], 1, data.size(), fp);
13 |         }
14 |         fclose(fp);
15 |     }
16 |     else
17 |     {
18 |         std::cout << "Failed to open " << pFileName << "\n";
19 |         fclose(fp);
20 |         return false;
21 |     }
22 |     return true;
23 | }
24 | 
25 | bool ReadBinaryFile(const char* pFileName, std::vector<uint8_t>& image)
26 | {
27 |     FILE* fp = fopen(pFileName, "rb");
28 |     if (!fp)
29 |     {
30 |         std::cout << "Failed to open " << pFileName << "\n";
31 |         return false;
32 |     }
33 | 
34 |     fseek(fp, 0, SEEK_END);
35 |     const long fileLength = ftell(fp);
36 |     fseek(fp, 0, SEEK_SET);
37 |     if (!fileLength)
38 |     {
39 |         std::cout << pFileName << " has zero length\n";
40 |         fclose(fp);
41 |         return false;
42 |     }
43 | 
44 |     image.resize((size_t)fileLength);
45 |     fread(&image[0], 1, image.size(), fp);
46 |     fclose(fp);
47 |     return true;
48 | }


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/c_util/ScopeExit.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | template <typename T>
 4 | 
 5 | class ScopeExit
 6 | {
 7 | public:
 8 |     ScopeExit(T t) : t(t) {}
 9 |     ~ScopeExit() { t(); }
10 |     T t;
11 | };
12 | 
13 | template <typename T>
14 | ScopeExit<T> MoveScopeExit(T t) {
15 |     return ScopeExit<T>(t);
16 | };
17 | 
18 | #define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
19 | #define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line)
20 | 
21 | #define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func();})
22 | 


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/profilerhost_util/Eval.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | #include <nvperf_host.h>
 6 | 
 7 | namespace NV {
 8 |     namespace Metric {
 9 |         namespace Eval {
10 |             struct MetricNameValue {
11 |                 std::string metricName;
12 |                 int numRanges;
13 |                 // <rangeName , metricValue> pair
14 |                 std::vector < std::pair<std::string, double> > rangeNameMetricValueMap;
15 |             };
16 | 
17 | 
18 |             /* Function to get aggregate metric value
19 |              * @param[in]  chipName                 Chip name for which to get metric values
20 |              * @param[in]  counterDataImage         Counter data image
21 |              * @param[in]  metricNames              List of metrics to read from counter data image
22 |              * @param[out] metricNameValueMap       Metric name value map
23 |              */
24 |             bool GetMetricGpuValue(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<uint8_t> counterDataImage, std::vector<std::string> metricNames, std::vector<MetricNameValue>& metricNameValueMap);
25 | 
26 |             bool PrintMetricValues(std::string chipName, std::vector<uint8_t> counterDataImage, std::vector<std::string> metricNames);
27 | 
28 |             }
29 |     }
30 | }
31 | 


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/profilerhost_util/List.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | namespace NV {
 3 |     namespace Metric {
 4 |         namespace Enum {
 5 |             // Function to print list of all supported chips
 6 |             bool ListSupportedChips();
 7 | 
 8 |             /* Function to print list of all metrics for a given chip
 9 |              * @param[in]  chipName         Chip Name for which metrics are to be listed
10 |              * @param[in]  listSubMetrics   Whether submetrics(Peak, PerCycle, PctOfPeak) are to be listed or not
11 |              */
12 |             bool ListMetrics(const char* chipName, bool listSubMetrics);
13 |         }
14 |     }
15 | }


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/profilerhost_util/Metric.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | #include <vector>
 5 | #include <nvperf_host.h>
 6 | 
 7 | namespace NV {
 8 |     namespace Metric {
 9 |         namespace Config {
10 |             /* Function to get Config image
11 |             * @param[in]  chipName            Chip name for which configImage is to be generated
12 |             * @param[in]  metricNames         List of metrics for which configImage is to be generated
13 |             * @param[out] configImage         Generated configImage
14 |             */
15 |             bool GetConfigImage(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<std::string> metricNames, std::vector<uint8_t>& configImage);
16 | 
17 |             /* Function to get CounterDataPrefix image
18 |             * @param[in]  chipName                  Chip name for which counterDataImagePrefix is to be generated
19 |             * @param[in]  metricNames               List of metrics for which counterDataImagePrefix is to be generated
20 |             * @param[out] counterDataImagePrefix    Generated counterDataImagePrefix
21 |             */
22 |             bool GetCounterDataPrefixImage(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<std::string> metricNames, std::vector<uint8_t>& counterDataImagePrefix);
23 |         }
24 |     }
25 | }
26 | 


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/include/profilerhost_util/Parser.h:
--------------------------------------------------------------------------------
 1 | #include <string>
 2 | 
 3 | namespace NV {
 4 |     namespace Metric {
 5 |         namespace Parser {
 6 |             inline bool ParseMetricNameString(const std::string& metricName, std::string* reqName, bool* isolated, bool* keepInstances)
 7 |             {
 8 |                 std::string& name = *reqName;
 9 |                 name = metricName;
10 |                 if (name.empty())
11 |                 {
12 |                     return false;
13 |                 }
14 | 
15 |                 // boost program_options sometimes inserts a \n between the metric name and a '&' at the end
16 |                 size_t pos = name.find('\n');
17 |                 if (pos != std::string::npos)
18 |                 {
19 |                     name.erase(pos, 1);
20 |                 }
21 | 
22 |                 // trim whitespace
23 |                 while (name.back() == ' ')
24 |                 {
25 |                     name.pop_back();
26 |                     if (name.empty())
27 |                     {
28 |                         return false;
29 |                     }
30 |                 }
31 | 
32 |                 *keepInstances = false;
33 |                 if (name.back() == '+')
34 |                 {
35 |                     *keepInstances = true;
36 |                     name.pop_back();
37 |                     if (name.empty())
38 |                     {
39 |                         return false;
40 |                     }
41 |                 }
42 | 
43 |                 *isolated = true;
44 |                 if (name.back() == '$')
45 |                 {
46 |                     name.pop_back();
47 |                     if (name.empty())
48 |                     {
49 |                         return false;
50 |                     }
51 |                 }
52 |                 else if (name.back() == '&')
53 |                 {
54 |                     *isolated = false;
55 |                     name.pop_back();
56 |                     if (name.empty())
57 |                     {
58 |                         return false;
59 |                     }
60 |                 }
61 | 
62 |                 return true;
63 |             }
64 |         }
65 |     }
66 | }


--------------------------------------------------------------------------------
/cpp/external/cupti_profilerhost_util/src/profilerhost_util/List.cpp:
--------------------------------------------------------------------------------
 1 | #include <List.h>
 2 | #include <iostream>
 3 | #include <nvperf_host.h>
 4 | #include <nvperf_cuda_host.h>
 5 | #include <ScopeExit.h>
 6 | 
 7 | #define RETURN_IF_NVPW_ERROR(retval, actual) \
 8 |     do { \
 9 |         if (NVPA_STATUS_SUCCESS != actual) { \
10 |             fprintf(stderr, "FAILED: %s\n", #actual); \
11 |             return retval; \
12 |         } \
13 |     } while (0)
14 | 
15 | namespace NV {
16 |     namespace Metric {
17 |         namespace Enum {
18 |             bool ListSupportedChips() {
19 |                 NVPW_GetSupportedChipNames_Params getSupportedChipNames = { NVPW_GetSupportedChipNames_Params_STRUCT_SIZE };
20 |                 RETURN_IF_NVPW_ERROR(false, NVPW_GetSupportedChipNames(&getSupportedChipNames));
21 |                 std::cout << "\n Number of supported chips : " << getSupportedChipNames.numChipNames;
22 |                 std::cout << "\n List of supported chips : \n";
23 | 
24 |                 for (size_t i = 0; i < getSupportedChipNames.numChipNames; i++) {
25 |                     std::cout << " " << getSupportedChipNames.ppChipNames[i] << "\n";
26 |                 }
27 | 
28 |                 return true;
29 |             }
30 | 
31 |             bool ListMetrics(const char* chip, bool listSubMetrics) {
32 | 
33 |                 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE };
34 |                 metricsContextCreateParams.pChipName = chip;
35 |                 RETURN_IF_NVPW_ERROR(false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
36 | 
37 |                 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE };
38 |                 metricsContextDestroyParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
39 |                 SCOPE_EXIT([&]() { NVPW_MetricsContext_Destroy((NVPW_MetricsContext_Destroy_Params *)&metricsContextDestroyParams); });
40 | 
41 |                 NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE };
42 |                 getMetricNameBeginParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
43 |                 getMetricNameBeginParams.hidePeakSubMetrics = !listSubMetrics;
44 |                 getMetricNameBeginParams.hidePerCycleSubMetrics = !listSubMetrics;
45 |                 getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listSubMetrics;
46 |                 RETURN_IF_NVPW_ERROR(false, NVPW_MetricsContext_GetMetricNames_Begin(&getMetricNameBeginParams));
47 | 
48 |                 NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE };
49 |                 getMetricNameEndParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
50 |                 SCOPE_EXIT([&]() { NVPW_MetricsContext_GetMetricNames_End((NVPW_MetricsContext_GetMetricNames_End_Params *)&getMetricNameEndParams); });
51 |                 
52 |                 std::cout << getMetricNameBeginParams.numMetrics << " metrics in total on the chip\n Metrics List : \n";
53 |                 for (size_t i = 0; i < getMetricNameBeginParams.numMetrics; i++) {
54 |                     std::cout << getMetricNameBeginParams.ppMetricNames[i] << "\n";
55 |                 }
56 | 
57 |                 return true;
58 |             }
59 |         }
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/cpp/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(cuda)
2 | add_subdirectory(frontend)
3 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This library encapsulates Habitat's bindings to CUDA related utilities, such as CUPTI and the standalone
 2 | # occupancy calculator. It should not be dependent on other Habitat libraries.
 3 | add_library(habitat-cuda-lib
 4 |   diagnostics.cu
 5 |   cupti_exceptions.cpp
 6 |   cupti_manager.cpp
 7 |   cupti_profiler.cpp
 8 |   cupti_tracer.cpp
 9 |   kernel.cpp
10 |   legacy_cupti_profiler.cpp
11 |   new_cupti_profiler.cpp
12 | )
13 | 
14 | target_link_libraries(habitat-cuda-lib
15 |   PRIVATE
16 |     CUDA::cupti
17 |     CUDA::cudart
18 |     CUDA::cuda_driver
19 |     cupti_profilerhost_util
20 |     ${NVPerf_LIBRARIES}
21 | )
22 | 
23 | target_include_directories(habitat-cuda-lib
24 |   PRIVATE
25 |     ${CUPTI_INCLUDE_DIRS}
26 | )
27 | 
28 | target_compile_options(
29 |   habitat-cuda-lib
30 |   PRIVATE
31 |     $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>
32 |     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_60,code=sm_60>"
33 |     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_61,code=sm_61>"
34 |     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_70,code=sm_70>"
35 |     "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_75,code=sm_75>"
36 | )
37 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cuda_macros.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cuda.h>
 4 | #include <stdexcept>
 5 | 
 6 | // This header should only be included in source files (i.e. .cpp files) that contain CUDA API calls.
 7 | 
 8 | #define RUNTIME_API_CALL(apiFuncCall)                                          \
 9 | do {                                                                           \
10 |     cudaError_t _status = apiFuncCall;                                         \
11 |     if (_status != cudaSuccess) {                                              \
12 |         fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
13 |                 __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
14 |         throw std::runtime_error("CUDA Runtime API call failed.");             \
15 |     }                                                                          \
16 | } while (0)
17 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_exceptions.cpp:
--------------------------------------------------------------------------------
 1 | #include "cupti_exceptions.h"
 2 | 
 3 | #include <string>
 4 | 
 5 | namespace habitat {
 6 | namespace cuda {
 7 | 
 8 | CuptiError::CuptiError(CUptiResult error_code, const char* error_message)
 9 |   : std::runtime_error(std::string(error_message)),
10 |     error_code_(error_code) {}
11 | 
12 | CuptiError CuptiError::from(CUptiResult error_code) {
13 |   const char* message;
14 |   cuptiGetResultString(error_code, &message);
15 |   return CuptiError(error_code, message);
16 | }
17 | 
18 | }
19 | }
20 | 
21 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_exceptions.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdexcept>
 4 | #include <cupti.h>
 5 | 
 6 | namespace habitat {
 7 | namespace cuda {
 8 | 
 9 | class CuptiError : public std::runtime_error {
10 |  public:
11 |   static CuptiError from(CUptiResult error_code);
12 | 
13 |   CUptiResult errorCode() const {
14 |     return error_code_;
15 |   }
16 | 
17 |  private:
18 |   CuptiError(CUptiResult error_code, const char* error_message);
19 |   CUptiResult error_code_;
20 | };
21 | 
22 | }
23 | }
24 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_macros.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cupti.h>
 4 | #include <cstdio>
 5 | #include <stdexcept>
 6 | 
 7 | #include "cupti_exceptions.h"
 8 | 
 9 | // This header should only be included in source files (i.e. .cpp files) that contain CUPTI API calls.
10 | 
11 | #define CUPTI_CALL(call)                                                 \
12 | do {                                                                     \
13 |   CUptiResult _status = call;                                            \
14 |   if (_status != CUPTI_SUCCESS) {                                        \
15 |     const char* message;                                                 \
16 |     cuptiGetResultString(_status, &message);                             \
17 |     fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
18 |             __FILE__, __LINE__, #call, message);                         \
19 |     throw habitat::cuda::CuptiError::from(_status);                      \
20 |   }                                                                      \
21 | } while (0)
22 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_manager.cpp:
--------------------------------------------------------------------------------
 1 | #include "habitat_cupti.h"
 2 | 
 3 | #include <cuda.h>
 4 | #include <cupti.h>
 5 | #include <stdlib.h>
 6 | #include <stdio.h>
 7 | #include <algorithm>
 8 | #include <utility>
 9 | 
10 | #include "cupti_profiler.h"
11 | 
12 | namespace habitat {
13 | namespace cuda {
14 | 
15 | CuptiManager::CuptiManager()
16 |   : profiler_(CuptiProfiler::create()),
17 |     callbacks_bound_(false),
18 |     should_cache_metrics_(false) {}
19 | 
20 | CuptiManager& CuptiManager::instance() {
21 |   static std::unique_ptr<CuptiManager> manager(new CuptiManager());
22 |   return *manager;
23 | }
24 | 
25 | // CuptiManager::allocateTracer() is defined in cupti_tracer.cpp
26 | 
27 | void CuptiManager::unloadCupti() {
28 |   if (tracers_.size() > 0) {
29 |     throw std::runtime_error("Cannot unload CUPTI because at least one tracer is still bound.");
30 |   }
31 |   cudaDeviceSynchronize();
32 |   cuptiActivityFlushAll(0);
33 |   cuptiFinalize();
34 |   callbacks_bound_ = false;
35 | }
36 | 
37 | void CuptiManager::newKernelInstance(KernelInstance info) {
38 |   for (auto& tracer : tracers_) {
39 |     tracer->kernels_.push_back(info);
40 |   }
41 | }
42 | 
43 | void CuptiManager::measureMetric(
44 |     const std::string& metric_name,
45 |     std::function<void(void)> runnable,
46 |     std::vector<KernelInstance>& kernels) {
47 |   if (tracers_.size() > 0) {
48 |     throw std::runtime_error(
49 |         "A CuptiTracer instance still exists. Metrics cannot be measured when tracing is being performed.");
50 |   }
51 | 
52 |   // If the cache can fulfil the metrics request, just use the cache
53 |   if (should_cache_metrics_ &&
54 |     std::all_of(kernels.cbegin(), kernels.cend(), [&](auto& kernel) {
55 |       auto it = metrics_cache_.find(kernel.metadata());
56 |       return it != metrics_cache_.end() && it->second.count(metric_name) > 0;
57 |     })) {
58 | 
59 |     for (auto& kernel : kernels) {
60 |       auto it = metrics_cache_.find(kernel.metadata());
61 |       double value = it->second.at(metric_name);
62 |       kernel.addMetric(metric_name, value);
63 |     }
64 |     return;
65 |   }
66 | 
67 |   // Otherwise, run the profiler
68 |   profiler_->profile(metric_name, runnable, kernels);
69 | 
70 |   // Update the cache if needed
71 |   if (should_cache_metrics_) {
72 |     for (auto& kernel : kernels) {
73 |       auto& metrics_map = metrics_cache_[kernel.metadata()];
74 |       for (const auto& metric_pair : kernel.metrics()) {
75 |         // NOTE: Map insert/emplace will only do the actual insertion if the key has not been used.
76 |         //       Otherwise it just returns an iterator to the existing value.
77 |         metrics_map.emplace(metric_pair);
78 |       }
79 |     }
80 |   }
81 | }
82 | 
83 | void CuptiManager::setCacheMetrics(bool should_cache) {
84 |   if (should_cache_metrics_ && !should_cache) {
85 |     metrics_cache_.clear();
86 |   }
87 |   should_cache_metrics_ = should_cache;
88 | }
89 | 
90 | bool CuptiManager::isCachingMetrics() const {
91 |   return should_cache_metrics_;
92 | }
93 | 
94 | }
95 | }
96 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_profiler.cpp:
--------------------------------------------------------------------------------
 1 | #include "cupti_profiler.h"
 2 | 
 3 | #include <cuda.h>
 4 | #include <cuda_runtime_api.h>
 5 | #include <memory>
 6 | 
 7 | #include "legacy_cupti_profiler.h"
 8 | #include "new_cupti_profiler.h"
 9 | #include "cuda_macros.h"
10 | 
11 | namespace habitat {
12 | namespace cuda {
13 | 
14 | std::unique_ptr<CuptiProfiler> CuptiProfiler::create() {
15 |   cudaDeviceProp properties;
16 |   RUNTIME_API_CALL(cudaGetDeviceProperties(&properties, 0));
17 |   if (properties.major >= 7) {
18 |     // The new profiler is used for Volta and newer GPUs
19 |     return std::unique_ptr<CuptiProfiler>(new NewCuptiProfiler());
20 |   } else {
21 |     return std::unique_ptr<CuptiProfiler>(new LegacyCuptiProfiler());
22 |   }
23 | }
24 | 
25 | void CuptiProfiler::addMetrics(
26 |     std::vector<KernelInstance>& kernels, const std::vector<KernelMetric>& metrics, const std::string& metric_name) {
27 |   // Right now our "profiling model" is that only one metric is measured. NVIDIA's CUPTI documentation is
28 |   // unfortunately not that clear, so we don't know exactly how profiled "ranges" map to kernels. Right now
29 |   // we assume that the order of metric values matches the order in which the kernels were launched.
30 |   if (kernels.size() != metrics.size()) {
31 |     // Not sure how to proceed - we should throw an exception to be safe.
32 |     throw std::runtime_error("Encountered a KernelInstance and metrics vector size mismatch!");
33 |   }
34 | 
35 |   for (size_t i = 0; i < kernels.size(); i++) {
36 |     kernels.at(i).addMetric(metric_name, metrics.at(i).metricValue());
37 |   }
38 | }
39 | 
40 | }
41 | }
42 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_profiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <functional>
 4 | #include <memory>
 5 | #include <string>
 6 | #include <vector>
 7 | #include "kernel.h"
 8 | #include "metrics.h"
 9 | 
10 | namespace habitat {
11 | namespace cuda {
12 | 
13 | class CuptiManager;
14 | 
15 | class CuptiProfiler {
16 |  public:
17 |   virtual ~CuptiProfiler() {}
18 | 
19 |   /**
20 |    * Perform the profiling to measure the requested metric.
21 |    */
22 |   virtual void profile(
23 |       const std::string& metric_name,
24 |       std::function<void(void)> runnable,
25 |       std::vector<KernelInstance>& kernels) const = 0;
26 | 
27 |  protected:
28 |   CuptiProfiler() {}
29 | 
30 |   /**
31 |    * Utility function used to add measured kernel metrics to their associated KernelInstances.
32 |    */
33 |   static void addMetrics(
34 |       std::vector<KernelInstance>& kernels,
35 |       const std::vector<KernelMetric>& metrics,
36 |       const std::string& metric_name);
37 | 
38 |  private:
39 |   static std::unique_ptr<CuptiProfiler> create();
40 |   friend class CuptiManager;
41 | };
42 | 
43 | }
44 | }
45 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/cupti_tracer.cpp:
--------------------------------------------------------------------------------
  1 | #include "habitat_cupti.h"
  2 | 
  3 | #include <algorithm>
  4 | #include <cstdlib>
  5 | #include <iostream>
  6 | #include <stdexcept>
  7 | 
  8 | #include <cupti.h>
  9 | #include <cuda_runtime.h>
 10 | 
 11 | #include "cupti_macros.h"
 12 | 
 13 | #define BUF_SIZE (32 * 1024)
 14 | #define ALIGN_SIZE (8)
 15 | #define ALIGN_BUFFER(buffer, align)                                            \
 16 |   (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
 17 | 
 18 | namespace {
 19 | 
 20 | void handleCuptiActivity(CUpti_Activity* record) {
 21 |   if (record->kind != CUPTI_ACTIVITY_KIND_KERNEL && record->kind != CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) {
 22 |     return;
 23 |   }
 24 |   CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4 *) record;
 25 |   habitat::cuda::KernelInstance instance(
 26 |     habitat::cuda::KernelMetadata(
 27 |       std::string(kernel->name),
 28 |       kernel->gridX * kernel->gridY * kernel->gridZ,
 29 |       kernel->blockX * kernel->blockY * kernel->blockZ,
 30 |       kernel->dynamicSharedMemory,
 31 |       kernel->staticSharedMemory,
 32 |       kernel->registersPerThread),
 33 |     kernel->end - kernel->start);
 34 |   habitat::cuda::CuptiManager::instance().newKernelInstance(std::move(instance));
 35 | }
 36 | 
 37 | void cuptiBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
 38 |   uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
 39 |   if (bfr == NULL) {
 40 |     std::cerr << "ERROR: Out of memory! (malloc in CUPTI)" << std::endl;
 41 |     exit(-1);
 42 |   }
 43 | 
 44 |   *size = BUF_SIZE;
 45 |   *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
 46 |   *maxNumRecords = 0;
 47 | }
 48 | 
 49 | void cuptiBufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
 50 |   CUptiResult status;
 51 |   CUpti_Activity *record = NULL;
 52 | 
 53 |   if (validSize > 0) {
 54 |     do {
 55 |       status = cuptiActivityGetNextRecord(buffer, validSize, &record);
 56 |       if (status == CUPTI_SUCCESS) {
 57 |         handleCuptiActivity(record);
 58 | 
 59 |       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
 60 |         break;
 61 | 
 62 |       } else {
 63 |         CUPTI_CALL(status);
 64 |       }
 65 |     } while (1);
 66 | 
 67 |     // report any records dropped from the queue
 68 |     size_t dropped;
 69 |     CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
 70 |     if (dropped != 0) {
 71 |       std::cerr << "WARNING: CUPTI dropped " << dropped << " activity records." << std::endl;
 72 |     }
 73 |   }
 74 | 
 75 |   free(buffer);
 76 | }
 77 | 
 78 | inline void enableCuptiRecording() {
 79 |   CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
 80 | }
 81 | 
 82 | inline void flushCupti() {
 83 |   cudaDeviceSynchronize();
 84 |   cuptiActivityFlushAll(0);
 85 | }
 86 | 
 87 | inline void disableCuptiRecording() {
 88 |   flushCupti();
 89 |   CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
 90 | }
 91 | 
 92 | }
 93 | 
 94 | namespace habitat {
 95 | namespace cuda {
 96 | 
 97 | CuptiTracer::CuptiTracer() {}
 98 | 
 99 | std::vector<KernelInstance>&& CuptiTracer::kernels() && {
100 |   flushCupti();
101 |   return std::move(kernels_);
102 | }
103 | 
104 | std::vector<KernelInstance> CuptiTracer::lastKernels(size_t num_iterations) const {
105 |   flushCupti();
106 |   // NOTE: We assume that, after this point, no more kernels will be appended to the kernels_ vector.
107 |   if (kernels_.size() % num_iterations != 0) {
108 |     throw std::runtime_error("Recorded kernel size mismatch!");
109 |   }
110 | 
111 |   size_t num_kernels = kernels_.size() / num_iterations;
112 |   std::vector<KernelInstance> results;
113 |   results.reserve(num_kernels);
114 |   results.insert(results.begin(), kernels_.end() - num_kernels, kernels_.end());
115 |   return results;
116 | }
117 | 
118 | CuptiTracer::Ptr CuptiManager::allocateTracer() {
119 |   if (!callbacks_bound_) {
120 |     CUPTI_CALL(cuptiActivityRegisterCallbacks(cuptiBufferRequested, cuptiBufferCompleted));
121 |     callbacks_bound_ = true;
122 |   }
123 |   if (tracers_.size() == 0) {
124 |     enableCuptiRecording();
125 |   }
126 |   CuptiTracer::Ptr ptr(new CuptiTracer(), &detail::cuptiTracerDeleter);
127 |   tracers_.push_back(ptr.get());
128 |   return ptr;
129 | }
130 | 
131 | namespace detail {
132 | 
133 | void cuptiTracerDeleter(CuptiTracer* tracer) {
134 |   CuptiManager& manager = CuptiManager::instance();
135 |   auto it = std::find(manager.tracers_.begin(), manager.tracers_.end(), tracer);
136 |   if (it == manager.tracers_.end()) {
137 |     // Assertion failure
138 |     throw std::runtime_error("Did not find CUPTI tracer in the manager's list when deleting!");
139 |   }
140 |   manager.tracers_.erase(it);
141 |   if (manager.tracers_.size() == 0) {
142 |     disableCuptiRecording();
143 |   }
144 |   delete tracer;
145 | }
146 | 
147 | }
148 | }
149 | }
150 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/diagnostics.cu:
--------------------------------------------------------------------------------
 1 | #define SIZE 10000
 2 | 
 3 | namespace {
 4 | 
 5 | // We need this global variable to ensure that nvcc does not optimize away the
 6 | // operations inside flop_test().
 7 | __device__ float accum = 0.;
 8 | 
 9 | __global__ void flop_test() {
10 |   float a = 0.1;
11 | #pragma unroll
12 |   for (size_t i = 0; i < SIZE; i++) {
13 |     accum += a;
14 |   }
15 | }
16 | 
17 | }
18 | 
19 | namespace habitat {
20 | namespace cuda {
21 | namespace diagnostics {
22 | 
23 | void run_flop_test(size_t num_blocks, size_t threads_per_block) {
24 |   flop_test<<<num_blocks, threads_per_block>>>();
25 | }
26 | 
27 | }
28 | }
29 | }
30 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/diagnostics.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace habitat {
 4 | namespace cuda {
 5 | namespace diagnostics {
 6 | 
 7 | /**
 8 |  * Launches a single kernel that repeatedly performs 32-bit floating point adds.
 9 |  *
10 |  * This diagnostic kernel is used to help us determine the peak performance
11 |  * (GFLOP/s) of a device.
12 |  */
13 | void run_flop_test(size_t num_blocks = 8192, size_t threads_per_block = 256);
14 | 
15 | }
16 | }
17 | }
18 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/habitat_cupti.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <functional>
  4 | #include <memory>
  5 | #include <string>
  6 | #include <vector>
  7 | #include <unordered_map>
  8 | 
  9 | #include "kernel.h"
 10 | #include "metrics.h"
 11 | #include "cupti_profiler.h"
 12 | 
 13 | namespace habitat {
 14 | namespace cuda {
 15 | 
 16 | class CuptiManager;
 17 | class CuptiTracer;
 18 | 
 19 | namespace detail {
 20 | 
 21 | void cuptiTracerDeleter(CuptiTracer* tracer);
 22 | 
 23 | }
 24 | 
 25 | /**
 26 |  * Accumulates kernel invocations during its lifetime.
 27 |  */
 28 | class CuptiTracer {
 29 |  public:
 30 |   using Ptr = std::unique_ptr<CuptiTracer, decltype(&detail::cuptiTracerDeleter)>;
 31 | 
 32 |   CuptiTracer(const CuptiTracer&) = delete;
 33 |   CuptiTracer& operator=(const CuptiTracer&) = delete;
 34 | 
 35 |   /**
 36 |    * Grab all the recorded kernels.
 37 |    */
 38 |   std::vector<KernelInstance>&& kernels() &&;
 39 | 
 40 |   /**
 41 |    * Assuming the same operation was recorded multiple times, this returns the kernels in the last operation.
 42 |    */
 43 |   std::vector<KernelInstance> lastKernels(size_t num_iterations) const;
 44 | 
 45 |  private:
 46 |   CuptiTracer();
 47 |   std::vector<KernelInstance> kernels_;
 48 | 
 49 |   friend class CuptiManager;
 50 | };
 51 | 
 52 | /**
 53 |  * Singleton that manages bindings to CUPTI.
 54 |  */
 55 | class CuptiManager {
 56 |  public:
 57 |   static CuptiManager& instance();
 58 |   CuptiTracer::Ptr allocateTracer();
 59 | 
 60 |   /**
 61 |    * Measures the specified metric for the kernels invoked by a given runnable.
 62 |    *
 63 |    * The metrics will be appended to the respective KernelInstances passed in to
 64 |    * this method.
 65 |    *
 66 |    * NOTE: The kernels invoked by the runnable must already exist as KernelInstances
 67 |    *       (i.e. the tracer must be used first). This is because some of the CUPTI
 68 |    *       metrics APIs require the execution time of the kernels.
 69 |    */
 70 |   void measureMetric(
 71 |       const std::string& metric_name,
 72 |       std::function<void(void)> runnable,
 73 |       std::vector<KernelInstance>& kernels);
 74 | 
 75 |   /**
 76 |    * Ensures CUPTI is unloaded from the process.
 77 |    *
 78 |    * This throws an exception if there are still CuptiTracers bound to the manager.
 79 |    */
 80 |   void unloadCupti();
 81 | 
 82 |   /**
 83 |    * This method is NOT intended to be called directly by end users.
 84 |    *
 85 |    * This method is called by CUPTI when recording a trace of the kernels that have been executed.
 86 |    */
 87 |   void newKernelInstance(KernelInstance info);
 88 | 
 89 |   /**
 90 |    * Use this method to control whether or not the manager should cache kernel metrics.
 91 |    * This can be useful because metrics gathering is very slow on Volta and newer generations.
 92 |    *
 93 |    * By default caching is disabled.
 94 |    */
 95 |   void setCacheMetrics(bool should_cache);
 96 | 
 97 |   /**
 98 |    * Returns whether or not the manager is caching kernel metrics.
 99 |    */
100 |   bool isCachingMetrics() const;
101 | 
102 |   CuptiManager(const CuptiManager&) = delete;
103 |   CuptiManager& operator=(const CuptiManager&) = delete;
104 | 
105 |  private:
106 |   CuptiManager();
107 |   std::unique_ptr<CuptiProfiler> profiler_;
108 |   std::vector<CuptiTracer*> tracers_;
109 |   std::unordered_map<KernelMetadata, std::unordered_map<std::string, double>> metrics_cache_;
110 |   bool callbacks_bound_;
111 |   bool should_cache_metrics_;
112 | 
113 |   friend void detail::cuptiTracerDeleter(CuptiTracer *tracer);
114 | };
115 | 
116 | /**
117 |  * RAII-helper to ensure CUPTI is unloaded.
118 |  */
119 | class CuptiGuard {
120 |  public:
121 |   ~CuptiGuard() {
122 |     CuptiManager::instance().unloadCupti();
123 |   }
124 | };
125 | 
126 | }
127 | }
128 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/kernel.cpp:
--------------------------------------------------------------------------------
 1 | #include "kernel.h"
 2 | 
 3 | namespace {
 4 | 
 5 | // We don't want any symbols from this header to be visible outside of this file
 6 | #include "cuda_occupancy.h"
 7 | 
 8 | cudaOccDeviceProp occDeviceProps(const habitat::cuda::DeviceProperties& properties) {
 9 |   cudaOccDeviceProp device_properties;
10 |   device_properties.computeMajor = properties.compute_major;
11 |   device_properties.computeMinor = properties.compute_minor;
12 |   device_properties.maxThreadsPerBlock = properties.max_threads_per_block;
13 |   device_properties.maxThreadsPerMultiprocessor = properties.max_threads_per_multiprocessor;
14 |   device_properties.regsPerBlock = properties.regs_per_block;
15 |   device_properties.regsPerMultiprocessor = properties.regs_per_multiprocessor;
16 |   device_properties.warpSize = properties.warp_size;
17 |   device_properties.sharedMemPerBlock = properties.shared_mem_per_block;
18 |   device_properties.sharedMemPerMultiprocessor = properties.shared_mem_per_multiprocessor;
19 |   device_properties.numSms = properties.num_sms;
20 |   device_properties.sharedMemPerBlockOptin = properties.shared_mem_per_block_optin;
21 |   return device_properties;
22 | }
23 | 
24 | }
25 | 
26 | namespace habitat {
27 | namespace cuda {
28 | 
29 | uint32_t KernelMetadata::threadBlockOccupancy(const DeviceProperties& device) const {
30 |   return threadBlockOccupancy(device, registers_per_thread_);
31 | }
32 | 
33 | uint32_t KernelMetadata::threadBlockOccupancy(
34 |     const DeviceProperties& device, uint16_t registers_per_thread) const {
35 |   cudaOccDeviceProp device_properties(occDeviceProps(device));
36 |   cudaOccDeviceState device_state;
37 |   cudaOccFuncAttributes attributes;
38 |   attributes.maxThreadsPerBlock = INT_MAX;
39 |   attributes.maxDynamicSharedSizeBytes = INT_MAX;
40 |   attributes.numRegs = registers_per_thread;
41 |   attributes.sharedSizeBytes = static_shared_memory_;
42 | 
43 |   int res;
44 |   cudaOccResult result;
45 |   if ((res = cudaOccMaxActiveBlocksPerMultiprocessor(
46 |         &result,
47 |         &device_properties,
48 |         &attributes,
49 |         &device_state,
50 |         block_size_,
51 |         dynamic_shared_memory_)) != CUDA_OCC_SUCCESS) {
52 |     return 0;
53 |   }
54 | 
55 |   return result.activeBlocksPerMultiprocessor;
56 | }
57 | 
58 | bool operator==(const KernelMetadata& lhs, const KernelMetadata& rhs) {
59 |   return lhs.num_blocks_ == rhs.num_blocks_ &&
60 |     lhs.block_size_ == rhs.block_size_ &&
61 |     lhs.dynamic_shared_memory_ == rhs.dynamic_shared_memory_ &&
62 |     lhs.static_shared_memory_ == rhs.static_shared_memory_ &&
63 |     lhs.registers_per_thread_ == rhs.registers_per_thread_ &&
64 |     lhs.name_ == rhs.name_;
65 | }
66 | 
67 | void KernelInstance::addMetric(std::string name, double value) {
68 |   metrics_.push_back(std::make_pair(std::move(name), value));
69 | }
70 | 
71 | }
72 | }
73 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/kernel.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <string>
  4 | #include <cstdint>
  5 | #include <utility>
  6 | #include <vector>
  7 | 
  8 | #include "sampled_measurement.h"
  9 | #include "utils.h"
 10 | 
 11 | namespace habitat {
 12 | namespace cuda {
 13 | 
 14 | struct DeviceProperties {
 15 |   DeviceProperties(
 16 |       std::string name,
 17 |       int compute_major,
 18 |       int compute_minor,
 19 |       int max_threads_per_block,
 20 |       int max_threads_per_multiprocessor,
 21 |       int regs_per_block,
 22 |       int regs_per_multiprocessor,
 23 |       int warp_size,
 24 |       size_t shared_mem_per_block,
 25 |       size_t shared_mem_per_multiprocessor,
 26 |       int num_sms,
 27 |       size_t shared_mem_per_block_optin,
 28 |       int mem_bandwidth_gb,
 29 |       size_t base_clock_mhz,
 30 |       size_t peak_gflops_per_second)
 31 |         : name(std::move(name)),
 32 |           compute_major(compute_major),
 33 |           compute_minor(compute_minor),
 34 |           max_threads_per_block(max_threads_per_block),
 35 |           max_threads_per_multiprocessor(max_threads_per_multiprocessor),
 36 |           regs_per_block(regs_per_block),
 37 |           regs_per_multiprocessor(regs_per_multiprocessor),
 38 |           warp_size(warp_size),
 39 |           shared_mem_per_block(shared_mem_per_block),
 40 |           shared_mem_per_multiprocessor(shared_mem_per_multiprocessor),
 41 |           num_sms(num_sms),
 42 |           shared_mem_per_block_optin(shared_mem_per_block_optin),
 43 |           mem_bandwidth_gb(mem_bandwidth_gb),
 44 |           base_clock_mhz(base_clock_mhz),
 45 |           peak_gflops_per_second(peak_gflops_per_second) {}
 46 | 
 47 |   std::string name;
 48 |   int compute_major;
 49 |   int compute_minor;
 50 |   int max_threads_per_block;
 51 |   int max_threads_per_multiprocessor;
 52 |   int regs_per_block;
 53 |   int regs_per_multiprocessor;
 54 |   int warp_size;
 55 |   size_t shared_mem_per_block;
 56 |   size_t shared_mem_per_multiprocessor;
 57 |   int num_sms;
 58 |   size_t shared_mem_per_block_optin;
 59 |   int mem_bandwidth_gb;
 60 |   size_t base_clock_mhz;
 61 |   size_t peak_gflops_per_second;
 62 | };
 63 | 
 64 | class KernelMetadata {
 65 |  public:
 66 |   KernelMetadata(
 67 |       std::string name,
 68 |       int32_t num_blocks,
 69 |       int32_t block_size,
 70 |       int32_t dynamic_shared_memory,
 71 |       int32_t static_shared_memory,
 72 |       uint16_t registers_per_thread)
 73 |         : name_(std::move(name)),
 74 |           num_blocks_(num_blocks),
 75 |           block_size_(block_size),
 76 |           dynamic_shared_memory_(dynamic_shared_memory),
 77 |           static_shared_memory_(static_shared_memory),
 78 |           registers_per_thread_(registers_per_thread) {}
 79 | 
 80 |   const std::string& name() const {
 81 |     return name_;
 82 |   }
 83 | 
 84 |   int32_t numBlocks() const {
 85 |     return num_blocks_;
 86 |   }
 87 | 
 88 |   int32_t blockSize() const {
 89 |     return block_size_;
 90 |   }
 91 | 
 92 |   int32_t dynamicSharedMemory() const {
 93 |     return dynamic_shared_memory_;
 94 |   }
 95 | 
 96 |   int32_t staticSharedMemory() const {
 97 |     return static_shared_memory_;
 98 |   }
 99 | 
100 |   uint16_t registersPerThread() const {
101 |     return registers_per_thread_;
102 |   }
103 | 
104 |   /**
105 |    * Returns the theoretical thread block occupancy for this kernel when running on a
106 |    * specified GPU. The return value is zero if an error occurred.
107 |    */
108 |   uint32_t threadBlockOccupancy(const DeviceProperties& device, uint16_t registers_per_thread) const;
109 | 
110 |   /**
111 |    * Returns the theoretical thread block occupancy for this kernel when running on a specified
112 |    * GPU using the same number of registers as the kernel on the measured device. The return
113 |    * value is zero if an error occurred.
114 |    */
115 |   uint32_t threadBlockOccupancy(const DeviceProperties& device) const;
116 | 
117 |   friend bool operator==(const KernelMetadata& lhs, const KernelMetadata& rhs);
118 | 
119 |  private:
120 |   std::string name_;
121 |   int32_t num_blocks_;
122 |   int32_t block_size_;
123 |   int32_t dynamic_shared_memory_;
124 |   int32_t static_shared_memory_;
125 |   uint16_t registers_per_thread_;
126 | };
127 | 
128 | class KernelInstance {
129 |  public:
130 |   KernelInstance(KernelMetadata metadata, uint64_t run_time_ns)
131 |     : metadata_(std::move(metadata)), run_time_ns_(run_time_ns) {}
132 | 
133 |   const KernelMetadata& metadata() const {
134 |     return metadata_;
135 |   }
136 | 
137 |   uint64_t runTimeNs() const {
138 |     return run_time_ns_;
139 |   }
140 | 
141 |   void addMetric(std::string name, double value);
142 | 
143 |   const std::vector<std::pair<std::string, double>>& metrics() const {
144 |     return metrics_;
145 |   }
146 | 
147 |  private:
148 |   KernelMetadata metadata_;
149 |   uint64_t run_time_ns_;
150 |   std::vector<std::pair<std::string, double>> metrics_;
151 | };
152 | 
153 | }
154 | }
155 | 
156 | namespace std {
157 | 
158 | // Allow KernelMetadata to serve as a key in std::unordered_map
159 | template <>
160 | struct hash<habitat::cuda::KernelMetadata> {
161 |   std::size_t operator()(const habitat::cuda::KernelMetadata& m) const {
162 |     return habitat::utils::hash_combine(
163 |         m.name(),
164 |         m.numBlocks(),
165 |         m.blockSize(),
166 |         m.dynamicSharedMemory(),
167 |         m.staticSharedMemory(),
168 |         m.registersPerThread());
169 |   }
170 | };
171 | 
172 | }
173 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/legacy_cupti_profiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <functional>
 4 | #include <memory>
 5 | #include <vector>
 6 | 
 7 | #include "cupti_profiler.h"
 8 | #include "kernel.h"
 9 | 
10 | namespace habitat {
11 | namespace cuda {
12 | 
13 | /**
14 |  * Uses the legacy CUPTI metrics APIs (Pascal and older GPUs).
15 |  */
16 | class LegacyCuptiProfiler : public CuptiProfiler {
17 |  public:
18 |   void profile(
19 |       const std::string& metric_name,
20 |       std::function<void(void)> runnable,
21 |       std::vector<KernelInstance>& kernels) const override;
22 | 
23 |  private:
24 |   LegacyCuptiProfiler();
25 |   ~LegacyCuptiProfiler();
26 |   friend class CuptiProfiler;
27 | 
28 |   class Impl;
29 |   std::unique_ptr<Impl> impl_;
30 | };
31 | 
32 | }
33 | }
34 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/metrics.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | namespace habitat {
 6 | namespace cuda {
 7 | 
 8 | class KernelMetric {
 9 |  public:
10 |   KernelMetric(std::string kernel_name, double metric_value)
11 |     : kernel_name_(std::move(kernel_name)),
12 |       metric_value_(metric_value) {}
13 | 
14 |   const std::string& kernelName() const {
15 |     return kernel_name_;
16 |   }
17 | 
18 |   const double& metricValue() const {
19 |     return metric_value_;
20 |   }
21 | 
22 |  private:
23 |   std::string kernel_name_;
24 |   double metric_value_;
25 | };
26 | 
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/new_cupti_profiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <functional>
 4 | #include <string>
 5 | #include <unordered_map>
 6 | 
 7 | #include "cupti_profiler.h"
 8 | #include "kernel.h"
 9 | #include "metrics.h"
10 | 
11 | namespace habitat {
12 | namespace cuda {
13 | 
14 | /**
15 |  * Uses the new PerfWorks-based CUPTI profiling APIs (Volta and newer).
16 |  */
17 | class NewCuptiProfiler : public CuptiProfiler {
18 |  public:
19 |   void profile(
20 |       const std::string& metric_name,
21 |       std::function<void(void)> runnable,
22 |       std::vector<KernelInstance>& kernels) const override;
23 | 
24 |  private:
25 |   NewCuptiProfiler();
26 |   ~NewCuptiProfiler();
27 |   friend class CuptiProfiler;
28 | 
29 |   // We lazily initialize the profiler to prevent CUPTI from potentially
30 |   // introducing overhead when profiling NOT is used.
31 |   void initialize() const;
32 | 
33 |   class State;
34 |   mutable std::unique_ptr<State> state_;
35 | };
36 | 
37 | }
38 | }
39 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/sampled_measurement.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <stdexcept>
 4 | #include <algorithm>
 5 | 
 6 | namespace habitat {
 7 | namespace cuda {
 8 | 
 9 | template <typename T>
10 | struct SampledMeasurement {
11 |   SampledMeasurement(T median, T min, T max) : median(median), min(min), max(max) {
12 |     if (min > max || max < median || min > median) {
13 |       throw std::runtime_error("Invalid values passed into SampledMeasurement.");
14 |     }
15 |   }
16 |   explicit SampledMeasurement(T value) : SampledMeasurement(value, value, value) {}
17 | 
18 |   template <typename V, typename Mapper>
19 |   static SampledMeasurement<T> fromValues(std::vector<V>& values, Mapper mapper) {
20 |     std::vector<T> mapped_values;
21 |     mapped_values.resize(values.size());
22 |     std::transform(values.begin(), values.end(), mapped_values.begin(), mapper);
23 |     return SampledMeasurement<T>::fromValues(mapped_values);
24 |   }
25 | 
26 |   static SampledMeasurement<T> fromValues(std::vector<T>& values) {
27 |     std::sort(values.begin(), values.end());
28 |     size_t mid = values.size() / 2;
29 | 
30 |     if (values.size() % 2 == 0) {
31 |       T mid1 = values.at(mid);
32 |       T mid2 = values.at(mid - 1);
33 |       return SampledMeasurement<T>((mid1 + mid2) / 2, values.front(), values.back());
34 | 
35 |     } else {
36 |       return SampledMeasurement<T>(values.at(mid), values.front(), values.back());
37 |     }
38 |   }
39 | 
40 |   T median;
41 |   T min;
42 |   T max;
43 | };
44 | 
45 | }
46 | }
47 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/utils-inl.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace habitat {
 4 | namespace utils {
 5 | 
 6 | namespace detail {
 7 | 
 8 | // Combining hashes from: https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
 9 | inline void hash_combine(std::size_t& seed) {}
10 | 
11 | template <typename T, typename... Rest>
12 | inline void hash_combine(std::size_t& seed, const T& v, Rest... rest) {
13 |   std::hash<T> hasher;
14 |   seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2);
15 |   hash_combine(seed, rest...);
16 | }
17 | 
18 | }
19 | 
20 | template<typename... Values>
21 | inline std::size_t hash_combine(Values... values) {
22 |   std::size_t seed = 0;
23 |   detail::hash_combine(seed, values...);
24 |   return seed;
25 | }
26 | 
27 | }
28 | }
29 | 


--------------------------------------------------------------------------------
/cpp/src/cuda/utils.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | namespace habitat {
 4 | namespace utils {
 5 | 
 6 | /**
 7 |  * Hashes and combines several values together.
 8 |  *
 9 |  * Usage:
10 |  * std::size_t hash = habitat::utils::hash_combine("hello", "world", 1337);
11 |  *
12 |  */
13 | template<typename... Values>
14 | inline std::size_t hash_combine(Values... values);
15 | 
16 | }
17 | }
18 | 
19 | #include "utils-inl.h"
20 | 


--------------------------------------------------------------------------------
/cpp/src/device_info.cpp:
--------------------------------------------------------------------------------
 1 | #include <iostream>
 2 | #include <cstring>
 3 | 
 4 | #include <cuda_runtime.h>
 5 | #include <gflags/gflags.h>
 6 | 
 7 | DEFINE_int32(device, 0, "The ID of the device for which information should be extracted.");
 8 | 
 9 | int main(int argc, char* argv[]) {
10 |   std::string usage("Utility that extracts usage information about the GPU(s) on this machine.\nUsage: ");
11 |   usage += argv[0];
12 |   gflags::SetUsageMessage(usage);
13 |   gflags::SetVersionString("0.1.0");
14 |   gflags::ParseCommandLineFlags(&argc, &argv, /* remove_flags */ true);
15 | 
16 |   cudaDeviceProp props;
17 |   memset(&props, 0, sizeof(cudaDeviceProp));
18 |   cudaGetDeviceProperties(&props, FLAGS_device);
19 | 
20 |   std::cout << "compute_major: " << props.major << std::endl;
21 |   std::cout << "compute_minor: " << props.minor << std::endl;
22 |   std::cout << "max_threads_per_block: " << props.maxThreadsPerBlock << std::endl;
23 |   std::cout << "max_threads_per_multiprocessor: " << props.maxThreadsPerMultiProcessor << std::endl;
24 |   std::cout << "regs_per_block: " << props.regsPerBlock << std::endl;
25 |   std::cout << "regs_per_multiprocessor: " << props.regsPerMultiprocessor << std::endl;
26 |   std::cout << "warp_size: " << props.warpSize << std::endl;
27 |   std::cout << "shared_mem_per_block: " << props.sharedMemPerBlock << std::endl;
28 |   std::cout << "shared_mem_per_multiprocessor: " << props.sharedMemPerMultiprocessor << std::endl;
29 |   std::cout << "num_sms: " << props.multiProcessorCount << std::endl;
30 |   std::cout << "shared_mem_per_block_optin: " << props.sharedMemPerBlockOptin << std::endl;
31 | 
32 |   return 0;
33 | }
34 | 


--------------------------------------------------------------------------------
/cpp/src/frontend/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | target_sources(habitat_cuda
2 |   PRIVATE
3 |     model_bindings.cpp
4 |     profiler.cpp
5 | )
6 | 


--------------------------------------------------------------------------------
/cpp/src/frontend/model_bindings.cpp:
--------------------------------------------------------------------------------
 1 | #include "model_bindings.h"
 2 | 
 3 | #include <pybind11/pybind11.h>
 4 | #include <pybind11/stl.h>
 5 | 
 6 | #include "../cuda/kernel.h"
 7 | 
 8 | namespace py = pybind11;
 9 | using habitat::cuda::DeviceProperties;
10 | using habitat::cuda::KernelInstance;
11 | using habitat::cuda::KernelMetadata;
12 | 
13 | namespace habitat {
14 | namespace frontend {
15 | 
16 | void bindModels(pybind11::module& m) {
17 |   py::class_<DeviceProperties>(m, "DeviceProperties")
18 |     .def(
19 |         py::init<std::string, int, int, int, int, int, int, int, size_t, size_t, int, size_t, int, size_t, size_t>(),
20 |         py::arg("name"),
21 |         py::arg("compute_major"),
22 |         py::arg("compute_minor"),
23 |         py::arg("max_threads_per_block"),
24 |         py::arg("max_threads_per_multiprocessor"),
25 |         py::arg("regs_per_block"),
26 |         py::arg("regs_per_multiprocessor"),
27 |         py::arg("warp_size"),
28 |         py::arg("shared_mem_per_block"),
29 |         py::arg("shared_mem_per_multiprocessor"),
30 |         py::arg("num_sms"),
31 |         py::arg("shared_mem_per_block_optin"),
32 |         py::arg("mem_bandwidth_gb"),
33 |         py::arg("base_clock_mhz"),
34 |         py::arg("peak_gflops_per_second"))
35 |     .def("__repr__", [](const DeviceProperties& self) {
36 |       return std::string("DeviceProperties(name=" + self.name + ")");
37 |     }, py::return_value_policy::move)
38 |     .def_property_readonly("name", [](const DeviceProperties& self) {
39 |       return self.name;
40 |     })
41 |     .def_property_readonly("num_sms", [](const DeviceProperties& self) {
42 |       return self.num_sms;
43 |     })
44 |     .def_property_readonly("mem_bandwidth_gb", [](const DeviceProperties& self) {
45 |       return self.mem_bandwidth_gb;
46 |     })
47 |     .def_property_readonly("compute_capability", [](const DeviceProperties& self) {
48 |       return py::make_tuple(self.compute_major, self.compute_minor);
49 |     })
50 |     .def_property_readonly("base_clock_mhz", [](const DeviceProperties& self) {
51 |       return self.base_clock_mhz;
52 |     })
53 |     .def_property_readonly("peak_gflops_per_second", [](const DeviceProperties& self) {
54 |       return self.peak_gflops_per_second;
55 |     });
56 | 
57 |   py::class_<KernelInstance>(m, "KernelInstance")
58 |     .def_property_readonly("name", [](const KernelInstance& self) {
59 |       return self.metadata().name();
60 |     }, py::return_value_policy::reference_internal)
61 |     .def_property_readonly("run_time_ns", &KernelInstance::runTimeNs, py::return_value_policy::reference_internal)
62 |     .def_property_readonly("num_blocks", [](const KernelInstance& self) { return self.metadata().numBlocks(); })
63 |     .def_property_readonly("metrics", &KernelInstance::metrics, py::return_value_policy::reference_internal)
64 |     .def_property_readonly("metadata", [](const KernelInstance& self) {
65 |       py::dict metadata;
66 |       const KernelMetadata& kernel_metadata = self.metadata();
67 |       metadata["name"] = kernel_metadata.name();
68 |       metadata["num_blocks"] = kernel_metadata.numBlocks();
69 |       metadata["block_size"] = kernel_metadata.blockSize();
70 |       metadata["static_shared_memory"] = kernel_metadata.staticSharedMemory();
71 |       metadata["dynamic_shared_memory"] = kernel_metadata.dynamicSharedMemory();
72 |       metadata["registers_per_thread"] = kernel_metadata.registersPerThread();
73 |       return metadata;
74 |     })
75 |     .def("thread_block_occupancy", [](
76 |         const KernelInstance& self, const DeviceProperties& device, int registers_per_thread) {
77 |       if (registers_per_thread < 0) {
78 |         return self.metadata().threadBlockOccupancy(device);
79 |       } else {
80 |         return self.metadata().threadBlockOccupancy(device, registers_per_thread);
81 |       }
82 |     }, py::arg("device"), py::arg("registers_per_thread") = -1);
83 | }
84 | 
85 | }
86 | }
87 | 


--------------------------------------------------------------------------------
/cpp/src/frontend/model_bindings.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <pybind11/pybind11.h>
 4 | 
 5 | namespace habitat {
 6 | namespace frontend {
 7 | 
 8 | void bindModels(pybind11::module& m);
 9 | 
10 | }
11 | }
12 | 


--------------------------------------------------------------------------------
/cpp/src/frontend/profiler.cpp:
--------------------------------------------------------------------------------
 1 | #include "profiler.h"
 2 | 
 3 | #include "../cuda/habitat_cupti.h"
 4 | 
 5 | using habitat::cuda::CuptiManager;
 6 | using habitat::cuda::CuptiTracer;
 7 | using habitat::cuda::KernelInstance;
 8 | 
 9 | namespace {
10 | 
11 | std::vector<KernelInstance> measureRunTimes(
12 |     const std::function<void()>& runnable) {
13 |   std::vector<KernelInstance> kernels;
14 |   {
15 |     CuptiTracer::Ptr tracer = CuptiManager::instance().allocateTracer();
16 |     runnable();
17 |     kernels = std::move(*tracer).kernels();
18 |   }
19 |   return kernels;
20 | }
21 | 
22 | }
23 | 
24 | namespace habitat {
25 | namespace frontend {
26 | 
27 | void setCacheMetrics(bool should_cache) {
28 |   CuptiManager::instance().setCacheMetrics(should_cache);
29 | }
30 | 
31 | std::vector<KernelInstance> profile(std::function<void()> runnable) {
32 |   return measureRunTimes(runnable);
33 | }
34 | 
35 | std::vector<KernelInstance> profile(
36 |     std::function<void()> runnable, const std::string& metric) {
37 |   std::vector<cuda::KernelInstance> kernels(measureRunTimes(runnable));
38 |   CuptiManager::instance().measureMetric(metric, std::move(runnable), kernels);
39 |   return kernels;
40 | }
41 | 
42 | }
43 | }
44 | 


--------------------------------------------------------------------------------
/cpp/src/frontend/profiler.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <functional>
 4 | #include <string>
 5 | #include <vector>
 6 | 
 7 | #include "../cuda/kernel.h"
 8 | 
 9 | namespace habitat {
10 | namespace frontend {
11 | 
12 | void setCacheMetrics(bool should_cache);
13 | 
14 | std::vector<cuda::KernelInstance> profile(std::function<void()> runnable);
15 | 
16 | std::vector<cuda::KernelInstance> profile(
17 |     std::function<void()> runnable, const std::string& metric);
18 | 
19 | }
20 | }
21 | 


--------------------------------------------------------------------------------
/cpp/src/habitat_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include <pybind11/pybind11.h>
 2 | #include <pybind11/stl.h>
 3 | 
 4 | #include <functional>
 5 | 
 6 | #include "cuda/diagnostics.h"
 7 | #include "frontend/model_bindings.h"
 8 | #include "frontend/profiler.h"
 9 | 
10 | namespace py = pybind11;
11 | 
12 | PYBIND11_MODULE(habitat_cuda, m) {
13 |   habitat::frontend::bindModels(m);
14 | 
15 |   m.def("profile", [](py::function runnable_python, const std::string& metric) {
16 |     std::function<void()> runnable = [runnable_python]() {
17 |       runnable_python();
18 |     };
19 |     if (metric.size() == 0) {
20 |       return habitat::frontend::profile(std::move(runnable));
21 |     } else {
22 |       return habitat::frontend::profile(std::move(runnable), metric);
23 |     }
24 |   }, py::arg("runnable"), py::arg("metric") = "", py::return_value_policy::move);
25 | 
26 |   m.def("set_cache_metrics", [](bool should_cache) {
27 |     habitat::frontend::setCacheMetrics(should_cache);
28 |   }, py::arg("should_cache"));
29 | 
30 |   m.def_submodule("_diagnostics")
31 |     .def("run_flop_test", [](size_t num_blocks, size_t threads_per_block) {
32 |       habitat::cuda::diagnostics::run_flop_test(num_blocks, threads_per_block);
33 |     }, py::arg("num_blocks") = 8192, py::arg("threads_per_block") = 256);
34 | }
35 | 


--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
 2 | MAINTAINER Geoffrey Yu <gxyu@cs.toronto.edu>
 3 | 
 4 | RUN apt-get update --fix-missing && \
 5 |   apt-get install --no-install-recommends -y software-properties-common && \
 6 |   apt-get update && \
 7 |   apt-get install --no-install-recommends -y sudo && \
 8 |   apt-get install --no-install-recommends -y python3-pip python3-setuptools python3-dev && \
 9 |   apt-get install --no-install-recommends -y wget bzip2 ca-certificates libssl-dev && \
10 |   rm -rf /var/lib/apt/lists/*
11 | 
12 | RUN pip3 install wheel && pip3 install numpy PyYAML
13 | RUN pip3 install \
14 |   torch==1.4.0 \
15 |   pillow==7.2.0 \
16 |   torchvision==0.5.0 \
17 |   pandas==1.1.2 \
18 |   tqdm==4.49.0
19 | 
20 | # Download cmake
21 | RUN wget "https://github.com/Kitware/CMake/releases/download/v3.17.0-rc1/cmake-3.17.0-rc1.tar.gz" -O /opt/cmake-3.17.0-rc1.tar.gz && \
22 |   cd /opt && tar xzf cmake-3.17.0-rc1.tar.gz
23 | 
24 | # Install cmake
25 | RUN cd /opt/cmake-3.17.0-rc1 && \
26 |   ./bootstrap && \
27 |   make -j 16 && \
28 |   make install
29 | 
30 | # NOTE: gosu is used in create-user.sh
31 | RUN mkdir ~/.gnupg && echo "disable-ipv6" >> ~/.gnupg/dirmngr.conf
32 | RUN gpg --keyserver keyserver.ubuntu.com --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4
33 | RUN wget "https://github.com/tianon/gosu/releases/download/1.11/gosu-$(dpkg --print-architecture | awk -F- '{ print $NF }')" -O /usr/local/bin/gosu && \
34 |   wget "https://github.com/tianon/gosu/releases/download/1.11/gosu-$(dpkg --print-architecture | awk -F- '{ print $NF }').asc" -O /usr/local/bin/gosu.asc && \
35 |   gpg --verify /usr/local/bin/gosu.asc && \
36 |   rm /usr/local/bin/gosu.asc && \
37 |   chmod +x /usr/local/bin/gosu
38 | 
39 | COPY create-user.sh /usr/local/bin/create-user.sh
40 | RUN chmod +x /usr/local/bin/create-user.sh
41 | ENTRYPOINT ["/usr/local/bin/create-user.sh"]
42 | CMD ["/bin/bash"]
43 | 


--------------------------------------------------------------------------------
/docker/README.md:
--------------------------------------------------------------------------------
 1 | Habitat Docker Image
 2 | =====================
 3 | The Dockerfile in this directory specifies a Docker image that is used as our
 4 | development and test environment. Run `setup.sh` to build the Docker image.
 5 | 
 6 | To start a container, run the `start.sh` script.  The container is set up so
 7 | that your current account is duplicated inside the container (with the same
 8 | user ID and username). This prevents permission issues when accessing files in
 9 | mounted volumes inside and outside the container. The user inside the container
10 | will have `sudo` permissions; the account's password will be set to your
11 | username.
12 | 
13 | Note that the `start.sh` script will restart any containers that are stopped
14 | but have not been removed. If you make any changes to the Docker image and/or
15 | want to start a new container, you need to remove the existing container with
16 | `docker rm` before running `start.sh` again.
17 | 


--------------------------------------------------------------------------------
/docker/create-user.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # This script is executed each time the container is started to create a user
 4 | 
 5 | if [ -z $CONTAINER_UID ] || [ -z $CONTAINER_UNAME ]; then
 6 |   echo "Please set the \"CONTAINER_UID\" and \"CONTAINER_UNAME\" environment variables."
 7 |   exit 1
 8 | fi
 9 | 
10 | # Create the user if they do not exist
11 | if ! id -u ${CONTAINER_UNAME} &> /dev/null; then
12 |   # NOTE: The home directory is automatically created because we mount stuff inside it
13 |   useradd --shell /bin/bash -u ${CONTAINER_UID} ${CONTAINER_UNAME} && \
14 |     adduser ${CONTAINER_UNAME} sudo && \
15 |     echo "${CONTAINER_UNAME}:${CONTAINER_UNAME}" | chpasswd
16 | 
17 |   export HOME=/home/${CONTAINER_UNAME}
18 |   echo "cd /home/${CONTAINER_UNAME}" >> /home/${CONTAINER_UNAME}/.bashrc
19 |   echo "alias ls=\"ls --color\"" >> /home/${CONTAINER_UNAME}/.bashrc
20 |   chown ${CONTAINER_UNAME}:${CONTAINER_UNAME} /home/${CONTAINER_UNAME}
21 |   chown ${CONTAINER_UNAME}:${CONTAINER_UNAME} /home/${CONTAINER_UNAME}/.bashrc
22 | fi
23 | 
24 | exec /usr/local/bin/gosu ${CONTAINER_UNAME} "$@"
25 | 


--------------------------------------------------------------------------------
/docker/setup.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | SCRIPT_LOCATION=$(cd $(dirname $0) && pwd -P)
 3 | source $SCRIPT_LOCATION/vars.sh
 4 | 
 5 | echo "This script will build a habitat container image."
 6 | echo ""
 7 | read -p "Do you want to continue? (y/n) " -r
 8 | if [[ ! $REPLY =~ ^[Yy]$ ]]
 9 | then
10 |   exit 1
11 | fi
12 | 
13 | docker build -t $IMAGE_NAME .
14 | 


--------------------------------------------------------------------------------
/docker/start.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # This script drops you into a habitat container. The container image must
 4 | # exist. If it does not exist, build it first using the setup.sh script.
 5 | #
 6 | # If no containers exist (stopped or running), this script will run a new container.
 7 | # You can optionally pass in another directory to mount as an argument to this script.
 8 | # If no arguments are provided to the script, your home directory will be mounted
 9 | # inside the container at ~/home.
10 | #
11 | # If a running container exists, this script will just start a new bash shell inside
12 | # the container.
13 | #
14 | # If a stopped container exists, this script will restart it.
15 | 
16 | SCRIPT_LOCATION=$(cd $(dirname $0) && pwd -P)
17 | source $SCRIPT_LOCATION/vars.sh
18 | 
19 | UNAME=$(id -un)
20 | CONTAINER_NAME=$IMAGE_NAME-$UNAME
21 | 
22 | if [ -z $1 ]; then
23 |   MOUNT_VOL=$(cd ~ && pwd):/home/$UNAME/home
24 | else
25 |   ABS_MOUNT_DIR=$(cd $1 && pwd)
26 |   MOUNT_DIR=$(basename $ABS_MOUNT_DIR)
27 |   MOUNT_VOL=$ABS_MOUNT_DIR:/home/$UNAME/$MOUNT_DIR
28 | fi
29 | 
30 | RUNNING=$(docker ps --filter "status=running" --filter "name=$CONTAINER_NAME" --format "{{.ID}}")
31 | EXITED=$(docker ps --filter "status=exited" --filter "name=$CONTAINER_NAME" --format "{{.ID}}")
32 | 
33 | if [ -z "$RUNNING" ] && [ -z "$EXITED" ]; then
34 |   # Container was never started
35 | 
36 |   # NOTE: For stable all reduce measurements, it's important to ensure that the
37 |   #       shared memory limits are increased using
38 |   #
39 |   #         --shm-size=1g --ulimit memlock=-1
40 |   #
41 |   docker run -ti \
42 |     -e "CONTAINER_UID=$(id -u)" \
43 |     -e "CONTAINER_UNAME=$(id -un)" \
44 |     --name $CONTAINER_NAME \
45 |     --volume $MOUNT_VOL \
46 |     --runtime=nvidia \
47 |     --workdir=/home/$UNAME \
48 |     --shm-size=1g \
49 |     --ulimit memlock=-1 \
50 |     $IMAGE_NAME
51 | elif [ -z "$RUNNING" ]; then
52 |   # Container exited but was not removed. We can restart it.
53 |   docker start -ai $EXITED
54 | else
55 |   # Already running, so just attach
56 |   docker exec -it $CONTAINER_NAME \
57 |     /usr/local/bin/gosu \
58 |     $UNAME \
59 |     /bin/bash
60 | fi
61 | 


--------------------------------------------------------------------------------
/docker/vars.sh:
--------------------------------------------------------------------------------
1 | VARS_FILE_LOCATION=$(cd $(dirname $0) && pwd -P)
2 | IMAGE_NAME=habitat-cuda10.1-cudnn7
3 | 


--------------------------------------------------------------------------------
/experiments/.gitignore:
--------------------------------------------------------------------------------
1 | results
2 | 


--------------------------------------------------------------------------------
/experiments/dcgan/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2017, 
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/experiments/dcgan/README.md:
--------------------------------------------------------------------------------
 1 | # DCGAN
 2 | 
 3 | The code in this directory is adapted from the PyTorch DCGAN example that can
 4 | be found [here](https://github.com/pytorch/examples/tree/master/dcgan). As a
 5 | result, it is also licensed under the BSD-3 license (see `LICENSE`).
 6 | 
 7 | The original README can be found below.
 8 | 
 9 | -------------------------------------------
10 | 
11 | # Deep Convolution Generative Adversarial Networks
12 | 
13 | This example implements the paper [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](http://arxiv.org/abs/1511.06434)
14 | 
15 | The implementation is very close to the Torch implementation [dcgan.torch](https://github.com/soumith/dcgan.torch)
16 | 
17 | After every 100 training iterations, the files `real_samples.png` and `fake_samples.png` are written to disk
18 | with the samples from the generative model.
19 | 
20 | After every epoch, models are saved to: `netG_epoch_%d.pth` and `netD_epoch_%d.pth`
21 | 
22 | ## Downloading the dataset
23 | You can download the LSUN dataset by cloning [this repo](https://github.com/fyu/lsun) and running
24 | ```
25 | python download.py -c bedroom
26 | ```
27 | 
28 | ## Usage
29 | ```
30 | usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS]
31 |                [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ]
32 |                [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR]
33 |                [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG]
34 |                [--netD NETD]
35 | 
36 | optional arguments:
37 |   -h, --help            show this help message and exit
38 |   --dataset DATASET     cifar10 | lsun | mnist |imagenet | folder | lfw | fake
39 |   --dataroot DATAROOT   path to dataset
40 |   --workers WORKERS     number of data loading workers
41 |   --batchSize BATCHSIZE input batch size
42 |   --imageSize IMAGESIZE the height / width of the input image to network
43 |   --nz NZ               size of the latent z vector
44 |   --ngf NGF
45 |   --ndf NDF
46 |   --niter NITER         number of epochs to train for
47 |   --lr LR               learning rate, default=0.0002
48 |   --beta1 BETA1         beta1 for adam. default=0.5
49 |   --cuda                enables cuda
50 |   --ngpu NGPU           number of GPUs to use
51 |   --netG NETG           path to netG (to continue training)
52 |   --netD NETD           path to netD (to continue training)
53 |   --outf OUTF           folder to output images and model checkpoints
54 |   --manualSeed SEED     manual seed
55 |   --classes CLASSES     comma separated list of classes for the lsun data set
56 | ```
57 | 


--------------------------------------------------------------------------------
/experiments/dcgan/dcgan.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import os
  3 | import random
  4 | import torch
  5 | import torch.nn as nn
  6 | import torch.utils.data
  7 | 
  8 | 
  9 | def model_config():
 10 |     parser = argparse.ArgumentParser()
 11 |     parser.add_argument('--dataroot', required=False, help='path to dataset')
 12 |     parser.add_argument('--workers', type=int, help='number of data loading workers', default=2)
 13 |     parser.add_argument('--batchSize', type=int, default=64, help='input batch size')
 14 |     parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network')
 15 |     parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector')
 16 |     parser.add_argument('--ngf', type=int, default=64)
 17 |     parser.add_argument('--ndf', type=int, default=64)
 18 |     parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for')
 19 |     parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002')
 20 |     parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5')
 21 |     parser.add_argument('--cuda', action='store_true', help='enables cuda')
 22 |     parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works')
 23 |     parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use')
 24 |     parser.add_argument('--netG', default='', help="path to netG (to continue training)")
 25 |     parser.add_argument('--netD', default='', help="path to netD (to continue training)")
 26 |     parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints')
 27 |     parser.add_argument('--manualSeed', type=int, help='manual seed')
 28 |     parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set')
 29 | 
 30 |     opt = parser.parse_args(args=[])
 31 |     return opt
 32 | 
 33 | 
 34 | device = torch.device("cuda")
 35 | nz = 100
 36 | ngf = 64
 37 | ndf = 64
 38 | nc = 3
 39 | 
 40 | 
 41 | # custom weights initialization called on netG and netD
 42 | def weights_init(m):
 43 |     classname = m.__class__.__name__
 44 |     if classname.find('Conv') != -1:
 45 |         torch.nn.init.normal_(m.weight, 0.0, 0.02)
 46 |     elif classname.find('BatchNorm') != -1:
 47 |         torch.nn.init.normal_(m.weight, 1.0, 0.02)
 48 |         torch.nn.init.zeros_(m.bias)
 49 | 
 50 | 
 51 | class Generator(nn.Module):
 52 |     def __init__(self, ngpu):
 53 |         super(Generator, self).__init__()
 54 |         self.ngpu = ngpu
 55 |         self.main = nn.Sequential(
 56 |             # input is Z, going into a convolution
 57 |             nn.ConvTranspose2d(     nz, ngf * 8, 4, 1, 0, bias=False),
 58 |             nn.BatchNorm2d(ngf * 8),
 59 |             nn.ReLU(True),
 60 |             # state size. (ngf*8) x 4 x 4
 61 |             nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False),
 62 |             nn.BatchNorm2d(ngf * 4),
 63 |             nn.ReLU(True),
 64 |             # state size. (ngf*4) x 8 x 8
 65 |             nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False),
 66 |             nn.BatchNorm2d(ngf * 2),
 67 |             nn.ReLU(True),
 68 |             # state size. (ngf*2) x 16 x 16
 69 |             nn.ConvTranspose2d(ngf * 2,     ngf, 4, 2, 1, bias=False),
 70 |             nn.BatchNorm2d(ngf),
 71 |             nn.ReLU(True),
 72 |             # state size. (ngf) x 32 x 32
 73 |             nn.ConvTranspose2d(    ngf,      nc, 4, 2, 1, bias=False),
 74 |             nn.Tanh()
 75 |             # state size. (nc) x 64 x 64
 76 |         )
 77 | 
 78 |     def forward(self, input):
 79 |         if input.is_cuda and self.ngpu > 1:
 80 |             output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
 81 |         else:
 82 |             output = self.main(input)
 83 |         return output
 84 | 
 85 | 
 86 | class Discriminator(nn.Module):
 87 |     def __init__(self, ngpu):
 88 |         super(Discriminator, self).__init__()
 89 |         self.ngpu = ngpu
 90 |         self.main = nn.Sequential(
 91 |             # input is (nc) x 64 x 64
 92 |             nn.Conv2d(nc, ndf, 4, 2, 1, bias=False),
 93 |             nn.LeakyReLU(0.2, inplace=True),
 94 |             # state size. (ndf) x 32 x 32
 95 |             nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False),
 96 |             nn.BatchNorm2d(ndf * 2),
 97 |             nn.LeakyReLU(0.2, inplace=True),
 98 |             # state size. (ndf*2) x 16 x 16
 99 |             nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False),
100 |             nn.BatchNorm2d(ndf * 4),
101 |             nn.LeakyReLU(0.2, inplace=True),
102 |             # state size. (ndf*4) x 8 x 8
103 |             nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False),
104 |             nn.BatchNorm2d(ndf * 8),
105 |             nn.LeakyReLU(0.2, inplace=True),
106 |             # state size. (ndf*8) x 4 x 4
107 |             nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False),
108 |             nn.Sigmoid()
109 |         )
110 | 
111 |     def forward(self, input):
112 |         if input.is_cuda and self.ngpu > 1:
113 |             output = nn.parallel.data_parallel(self.main, input, range(self.ngpu))
114 |         else:
115 |             output = self.main(input)
116 | 
117 |         return output.view(-1, 1).squeeze(1)
118 | 


--------------------------------------------------------------------------------
/experiments/dcgan/entry_point.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | from . import dcgan
 5 | 
 6 | 
 7 | def skyline_model_provider(numgpu=1):
 8 |     netG = dcgan.Generator(numgpu).cuda()
 9 |     netG.apply(dcgan.weights_init)
10 |     netD = dcgan.Discriminator(numgpu).cuda()
11 |     netD.apply(dcgan.weights_init)
12 |     return netG, netD
13 | 
14 | 
15 | def skyline_input_provider(batch_size=64):
16 |     return (
17 |         batch_size,
18 |         torch.randn((batch_size, 3, 64, 64)).cuda(),
19 |     )
20 | 
21 | 
22 | def skyline_iteration_provider(netG, netD):
23 |     real_label = 1
24 |     fake_label = 0
25 |     opt = dcgan.model_config()
26 | 
27 |     optimizerD = torch.optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
28 |     optimizerG = torch.optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999))
29 | 
30 |     criterion = nn.BCELoss()
31 | 
32 |     device = torch.device("cuda")
33 | 
34 |     def iteration(*inputs):
35 |         #for i, data in enumerate(dataloader, 0):
36 |         ############################
37 |         # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
38 |         ###########################
39 |         batch_size, data = inputs
40 |         # train with real
41 |         netD.zero_grad()
42 |         real_cpu = data.to(device)
43 |         label = torch.full((batch_size,), real_label,
44 |                            dtype=real_cpu.dtype, device=device)
45 |         output = netD(real_cpu)
46 |         errD_real = criterion(output, label)
47 |         errD_real.backward()
48 | 
49 |         # train with fake
50 |         noise = torch.randn(batch_size, dcgan.nz, 1, 1, device=device)
51 |         fake = netG(noise)
52 |         label.fill_(fake_label)
53 |         output = netD(fake.detach())
54 |         errD_fake = criterion(output, label)
55 |         errD_fake.backward()
56 |         optimizerD.step()
57 | 
58 |         ############################
59 |         # (2) Update G network: maximize log(D(G(z)))
60 |         ###########################
61 |         netG.zero_grad()
62 |         label.fill_(real_label)  # fake labels are real for generator cost
63 |         output = netD(fake)
64 |         errG = criterion(output, label)
65 |         errG.backward()
66 |         optimizerG.step()
67 |     return iteration
68 | 


--------------------------------------------------------------------------------
/experiments/gather_raw_data.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | # Operate out of the script directory
 6 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P)
 7 | cd $SCRIPT_PATH
 8 | 
 9 | if [ -z "$1" ]; then
10 |   echo "Usage: $0 <device>"
11 |   exit 1
12 | fi
13 | 
14 | python3 run_experiment.py $1
15 | tar cvzf hv2-$1.tar.gz *.csv
16 | rm *.csv
17 | 


--------------------------------------------------------------------------------
/experiments/gnmt/README.md:
--------------------------------------------------------------------------------
 1 | # GNMT (Google Neural Machine Translation) Model
 2 | 
 3 | This directory contains an implementation of GNMT that was adapted from the
 4 | code found in the [MLPerf training
 5 | repository](https://github.com/mlperf/training/tree/master/rnn_translator).
 6 | 
 7 | ## License
 8 | 
 9 | This code, with the exception of the `skyline_` prefixed functions in
10 | `entry_point.py`, was adapted from the MLPerf training benchmarks and therefore
11 | shares the same license. The unmodified license can be found in the `LICENSE`
12 | file in the `seq2seq` directory.
13 | 


--------------------------------------------------------------------------------
/experiments/gnmt/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/gnmt/__init__.py


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Elad Hoffer
 4 | Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 5 | 
 6 | Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | of this software and associated documentation files (the "Software"), to deal
 8 | in the Software without restriction, including without limitation the rights
 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/data/config.py:
--------------------------------------------------------------------------------
 1 | PAD_TOKEN = '<pad>'
 2 | UNK_TOKEN = '<unk>'
 3 | BOS_TOKEN = '<s>'
 4 | EOS_TOKEN = '<\s>'
 5 | 
 6 | # special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens
 7 | PAD, UNK, BOS, EOS = [0, 1, 2, 3]
 8 | 
 9 | # path to the BPE vocabulary file, relative to the data directory, it should
10 | # point to file generated by subword-nmt/get_vocab.py
11 | VOCAB_FNAME = 'vocab.bpe.32000'
12 | 
13 | # paths to source and target training files, relative to the data directory, it
14 | # should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
15 | SRC_TRAIN_FNAME = 'train.tok.clean.bpe.32000.en'
16 | TGT_TRAIN_FNAME = 'train.tok.clean.bpe.32000.de'
17 | 
18 | # paths to source and target validation files, relative to the data directory,
19 | # it should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
20 | SRC_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.en'
21 | TGT_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.de'
22 | 
23 | # path to the test source file, relative to the data directory, it should point
24 | # to BPE-encoded file, generated by subword-nmt/apply_bpe.py
25 | SRC_TEST_FNAME = 'newstest2014.tok.bpe.32000.en'
26 | 
27 | # path to the test target file, relative to the data directory, it should point
28 | # to plaintext file, tokenization is performed by the sacrebleu package
29 | TGT_TEST_TARGET_FNAME = 'newstest2014.de'
30 | 
31 | # path to the moses detokenizer, relative to the data directory
32 | DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl'
33 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/data/tokenizer.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from collections import defaultdict
  3 | from functools import partial
  4 | 
  5 | import gnmt.seq2seq.data.config as config
  6 | 
  7 | 
  8 | class Tokenizer:
  9 |     """
 10 |     Tokenizer class.
 11 |     """
 12 |     def __init__(self, vocab_fname=None, pad=1, separator='@@'):
 13 |         """
 14 |         Constructor for the Tokenizer class.
 15 | 
 16 |         :param vocab_fname: path to the file with vocabulary
 17 |         :param pad: pads vocabulary to a multiple of 'pad' tokens
 18 |         :param separator: tokenization separator
 19 |         """
 20 |         if vocab_fname:
 21 |             self.separator = separator
 22 | 
 23 |             logging.info(f'Building vocabulary from {vocab_fname}')
 24 |             vocab = [config.PAD_TOKEN, config.UNK_TOKEN,
 25 |                      config.BOS_TOKEN, config.EOS_TOKEN]
 26 | 
 27 |             with open(vocab_fname) as vfile:
 28 |                 for line in vfile:
 29 |                     vocab.append(line.strip())
 30 | 
 31 |             self.pad_vocabulary(vocab, pad)
 32 | 
 33 |             self.vocab_size = len(vocab)
 34 |             logging.info(f'Size of vocabulary: {self.vocab_size}')
 35 | 
 36 |             self.tok2idx = defaultdict(partial(int, config.UNK))
 37 |             for idx, token in enumerate(vocab):
 38 |                 self.tok2idx[token] = idx
 39 | 
 40 |             self.idx2tok = {}
 41 |             for key, value in self.tok2idx.items():
 42 |                 self.idx2tok[value] = key
 43 | 
 44 |     def pad_vocabulary(self, vocab, pad):
 45 |         """
 46 |         Pads vocabulary to a multiple of 'pad' tokens.
 47 | 
 48 |         :param vocab: list with vocabulary
 49 |         :param pad: integer
 50 |         """
 51 |         vocab_size = len(vocab)
 52 |         padded_vocab_size = (vocab_size + pad - 1) // pad * pad
 53 |         for i in range(0, padded_vocab_size - vocab_size):
 54 |             token = f'madeupword{i:04d}'
 55 |             vocab.append(token)
 56 |         assert len(vocab) % pad == 0
 57 | 
 58 |     def get_state(self):
 59 |         logging.info(f'Saving state of the tokenizer')
 60 |         state = {
 61 |             'separator': self.separator,
 62 |             'vocab_size': self.vocab_size,
 63 |             'tok2idx': self.tok2idx,
 64 |             'idx2tok': self.idx2tok,
 65 |         }
 66 |         return state
 67 | 
 68 |     def set_state(self, state):
 69 |         logging.info(f'Restoring state of the tokenizer')
 70 |         self.separator = state['separator']
 71 |         self.vocab_size = state['vocab_size']
 72 |         self.tok2idx = state['tok2idx']
 73 |         self.idx2tok = state['idx2tok']
 74 | 
 75 |     def segment(self, line):
 76 |         """
 77 |         Tokenizes single sentence and adds special BOS and EOS tokens.
 78 | 
 79 |         :param line: sentence
 80 | 
 81 |         returns: list representing tokenized sentence
 82 |         """
 83 |         line = line.strip().split()
 84 |         entry = [self.tok2idx[i] for i in line]
 85 |         entry = [config.BOS] + entry + [config.EOS]
 86 |         return entry
 87 | 
 88 |     def detokenize(self, inputs, delim=' '):
 89 |         """
 90 |         Detokenizes single sentence and removes token separator characters.
 91 | 
 92 |         :param inputs: sequence of tokens
 93 |         :param delim: tokenization delimiter
 94 | 
 95 |         returns: string representing detokenized sentence
 96 |         """
 97 |         detok = delim.join([self.idx2tok[idx] for idx in inputs])
 98 |         detok = detok.replace(self.separator + ' ', '')
 99 |         detok = detok.replace(self.separator, '')
100 | 
101 |         detok = detok.replace(config.BOS_TOKEN, '')
102 |         detok = detok.replace(config.EOS_TOKEN, '')
103 |         detok = detok.replace(config.PAD_TOKEN, '')
104 |         detok = detok.strip()
105 |         return detok
106 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/models/encoder.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from torch.nn.utils.rnn import pack_padded_sequence
 3 | from torch.nn.utils.rnn import pad_packed_sequence
 4 | 
 5 | import gnmt.seq2seq.data.config as config
 6 | from gnmt.seq2seq.utils import init_lstm_
 7 | 
 8 | 
 9 | class ResidualRecurrentEncoder(nn.Module):
10 |     """
11 |     Encoder with Embedding, LSTM layers, residual connections and optional
12 |     dropout.
13 | 
14 |     The first LSTM layer is bidirectional and uses variable sequence length
15 |     API, the remaining (num_layers-1) layers are unidirectional. Residual
16 |     connections are enabled after third LSTM layer, dropout is applied on
17 |     inputs to LSTM layers.
18 |     """
19 |     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
20 |                  batch_first=False, embedder=None, init_weight=0.1):
21 |         """
22 |         Constructor for the ResidualRecurrentEncoder.
23 | 
24 |         :param vocab_size: size of vocabulary
25 |         :param hidden_size: hidden size for LSTM layers
26 |         :param num_layers: number of LSTM layers, 1st layer is bidirectional
27 |         :param dropout: probability of dropout (on input to LSTM layers)
28 |         :param batch_first: if True the model uses (batch,seq,feature) tensors,
29 |             if false the model uses (seq, batch, feature)
30 |         :param embedder: instance of nn.Embedding, if None constructor will
31 |             create new embedding layer
32 |         :param init_weight: range for the uniform initializer
33 |         """
34 |         super(ResidualRecurrentEncoder, self).__init__()
35 |         self.batch_first = batch_first
36 |         self.rnn_layers = nn.ModuleList()
37 |         # 1st LSTM layer, bidirectional
38 |         self.rnn_layers.append(
39 |             nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
40 |                     batch_first=batch_first, bidirectional=True))
41 | 
42 |         # 2nd LSTM layer, with 2x larger input_size
43 |         self.rnn_layers.append(
44 |             nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True,
45 |                     batch_first=batch_first))
46 | 
47 |         # Remaining LSTM layers
48 |         for _ in range(num_layers - 2):
49 |             self.rnn_layers.append(
50 |                 nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True,
51 |                         batch_first=batch_first))
52 | 
53 |         for lstm in self.rnn_layers:
54 |             init_lstm_(lstm, init_weight)
55 | 
56 |         self.dropout = nn.Dropout(p=dropout)
57 | 
58 |         if embedder is not None:
59 |             self.embedder = embedder
60 |         else:
61 |             self.embedder = nn.Embedding(vocab_size, hidden_size,
62 |                                          padding_idx=config.PAD)
63 |             nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight)
64 | 
65 |     def forward(self, inputs, lengths):
66 |         """
67 |         Execute the encoder.
68 | 
69 |         :param inputs: tensor with indices from the vocabulary
70 |         :param lengths: vector with sequence lengths (excluding padding)
71 | 
72 |         returns: tensor with encoded sequences
73 |         """
74 |         x = self.embedder(inputs)
75 | 
76 |         # bidirectional layer
77 |         x = self.dropout(x)
78 |         x = pack_padded_sequence(x, lengths.cpu().numpy(),
79 |                                  batch_first=self.batch_first)
80 |         x, _ = self.rnn_layers[0](x)
81 |         x, _ = pad_packed_sequence(x, batch_first=self.batch_first)
82 | 
83 |         # 1st unidirectional layer
84 |         x = self.dropout(x)
85 |         x, _ = self.rnn_layers[1](x)
86 | 
87 |         # the rest of unidirectional layers,
88 |         # with residual connections starting from 3rd layer
89 |         for i in range(2, len(self.rnn_layers)):
90 |             residual = x
91 |             x = self.dropout(x)
92 |             x, _ = self.rnn_layers[i](x)
93 |             x = x + residual
94 | 
95 |         return x
96 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/models/gnmt.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | import gnmt.seq2seq.data.config as config
 4 | from gnmt.seq2seq.models.decoder import ResidualRecurrentDecoder
 5 | from gnmt.seq2seq.models.encoder import ResidualRecurrentEncoder
 6 | from gnmt.seq2seq.models.seq2seq_base import Seq2Seq
 7 | from gnmt.seq2seq.utils import gnmt_print
 8 | 
 9 | 
10 | class GNMT(Seq2Seq):
11 |     """
12 |     GNMT v2 model
13 |     """
14 |     def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2,
15 |                  batch_first=False, share_embedding=True):
16 |         """
17 |         Constructor for the GNMT v2 model.
18 | 
19 |         :param vocab_size: size of vocabulary (number of tokens)
20 |         :param hidden_size: internal hidden size of the model
21 |         :param num_layers: number of layers, applies to both encoder and
22 |             decoder
23 |         :param dropout: probability of dropout (in encoder and decoder)
24 |         :param batch_first: if True the model uses (batch,seq,feature) tensors,
25 |             if false the model uses (seq, batch, feature)
26 |         :param share_embedding: if True embeddings are shared between encoder
27 |             and decoder
28 |         """
29 | 
30 |         super(GNMT, self).__init__(batch_first=batch_first)
31 | 
32 |         if share_embedding:
33 |             embedder = nn.Embedding(vocab_size, hidden_size,
34 |                                     padding_idx=config.PAD)
35 |             nn.init.uniform_(embedder.weight.data, -0.1, 0.1)
36 |         else:
37 |             embedder = None
38 | 
39 |         self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size,
40 |                                                 num_layers, dropout,
41 |                                                 batch_first, embedder)
42 | 
43 |         self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size,
44 |                                                 num_layers, dropout,
45 |                                                 batch_first, embedder)
46 | 
47 |     def forward(self, input_encoder, input_enc_len, input_decoder):
48 |         context = self.encode(input_encoder, input_enc_len)
49 |         context = (context, input_enc_len, None)
50 |         output, _, _ = self.decode(input_decoder, context)
51 | 
52 |         return output
53 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/models/seq2seq_base.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | from torch.nn.functional import log_softmax
 3 | 
 4 | 
 5 | class Seq2Seq(nn.Module):
 6 |     """
 7 |     Generic Seq2Seq module, with an encoder and a decoder.
 8 |     """
 9 |     def __init__(self, encoder=None, decoder=None, batch_first=False):
10 |         """
11 |         Constructor for the Seq2Seq module.
12 | 
13 |         :param encoder: encoder module
14 |         :param decoder: decoder module
15 |         :param batch_first: if True the model uses (batch, seq, feature)
16 |             tensors, if false the model uses (seq, batch, feature) tensors
17 |         """
18 |         super(Seq2Seq, self).__init__()
19 |         self.encoder = encoder
20 |         self.decoder = decoder
21 |         self.batch_first = batch_first
22 | 
23 |     def encode(self, inputs, lengths):
24 |         """
25 |         Applies the encoder to inputs with a given input sequence lengths.
26 | 
27 |         :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
28 |             else (seq_len, batch)
29 |         :param lengths: vector with sequence lengths (excluding padding)
30 |         """
31 |         return self.encoder(inputs, lengths)
32 | 
33 |     def decode(self, inputs, context, inference=False):
34 |         """
35 |         Applies the decoder to inputs, given the context from the encoder.
36 | 
37 |         :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
38 |             else (seq_len, batch)
39 |         :param context: context from the encoder
40 |         :param inference: if True inference mode, if False training mode
41 |         """
42 |         return self.decoder(inputs, context, inference)
43 | 
44 |     def generate(self, inputs, context, beam_size):
45 |         """
46 |         Autoregressive generator, works with SequenceGenerator class.
47 |         Executes decoder (in inference mode), applies log_softmax and topK for
48 |         inference with beam search decoding.
49 | 
50 |         :param inputs: tensor with inputs to the decoder
51 |         :param context: context from the encoder
52 |         :param beam_size: beam size for the generator
53 | 
54 |         returns: (words, logprobs, scores, new_context)
55 |             words: indices of topK tokens
56 |             logprobs: log probabilities of topK tokens
57 |             scores: scores from the attention module (for coverage penalty)
58 |             new_context: new decoder context, includes new hidden states for
59 |                 decoder RNN cells
60 |         """
61 |         logits, scores, new_context = self.decode(inputs, context, True)
62 |         logprobs = log_softmax(logits, dim=-1)
63 |         logprobs, words = logprobs.topk(beam_size, dim=-1)
64 |         return words, logprobs, scores, new_context
65 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/train/lr_scheduler.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import math
 3 | 
 4 | import torch
 5 | 
 6 | from gnmt.seq2seq.utils import gnmt_print
 7 | 
 8 | 
 9 | def perhaps_convert_float(param, total):
10 |     if isinstance(param, float):
11 |         param = int(param * total)
12 |     return param
13 | 
14 | 
15 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
16 |     """
17 |     Learning rate scheduler with exponential warmup and step decay.
18 |     """
19 |     def __init__(self, optimizer, iterations, warmup_steps=0,
20 |                  remain_steps=1.0, decay_interval=None, decay_steps=4,
21 |                  decay_factor=0.5, last_epoch=-1):
22 |         """
23 |         Constructor of WarmupMultiStepLR.
24 | 
25 |         Parameters: warmup_steps, remain_steps and decay_interval accept both
26 |         integers and floats as an input. Integer input is interpreted as
27 |         absolute index of iteration, float input is interpreted as a fraction
28 |         of total training iterations (epochs * steps_per_epoch).
29 | 
30 |         If decay_interval is None then the decay will happen at regulary spaced
31 |         intervals ('decay_steps' decays between iteration indices
32 |         'remain_steps' and 'iterations').
33 | 
34 |         :param optimizer: instance of optimizer
35 |         :param iterations: total number of training iterations
36 |         :param warmup_steps: number of warmup iterations
37 |         :param remain_steps: start decay at 'remain_steps' iteration
38 |         :param decay_interval: interval between LR decay steps
39 |         :param decay_steps: max number of decay steps
40 |         :param decay_factor: decay factor
41 |         :param last_epoch: the index of last iteration
42 |         """
43 | 
44 |         # iterations before learning rate reaches base LR
45 |         self.warmup_steps = perhaps_convert_float(warmup_steps, iterations)
46 | 
47 |         # iteration at which decay starts
48 |         self.remain_steps = perhaps_convert_float(remain_steps, iterations)
49 | 
50 |         # number of steps between each decay
51 |         if decay_interval is None:
52 |             # decay at regulary spaced intervals
53 |             decay_iterations = iterations - self.remain_steps
54 |             self.decay_interval = decay_iterations // (decay_steps)
55 |             self.decay_interval = max(self.decay_interval, 1)
56 |         else:
57 |             self.decay_interval = perhaps_convert_float(decay_interval,
58 |                                                         iterations)
59 | 
60 |         # multiplicative decay factor
61 |         self.decay_factor = decay_factor
62 | 
63 |         # max number of decay steps
64 |         self.decay_steps = decay_steps
65 | 
66 |         if self.warmup_steps > self.remain_steps:
67 |             logging.warn(f'warmup_steps should not be larger than '
68 |                          f'remain_steps, setting warmup_steps=remain_steps')
69 |             self.warmup_steps = self.remain_steps
70 | 
71 |         super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
72 | 
73 |     def get_lr(self):
74 |         if self.last_epoch <= self.warmup_steps:
75 |             # exponential lr warmup
76 |             if self.warmup_steps != 0:
77 |                 warmup_factor = math.exp(math.log(0.01) / self.warmup_steps)
78 |             else:
79 |                 warmup_factor = 1.0
80 |             inv_decay = warmup_factor ** (self.warmup_steps - self.last_epoch)
81 |             lr = [base_lr * inv_decay for base_lr in self.base_lrs]
82 | 
83 |         elif self.last_epoch >= self.remain_steps:
84 |             # step decay
85 |             decay_iter = self.last_epoch - self.remain_steps
86 |             num_decay_steps = decay_iter // self.decay_interval + 1
87 |             num_decay_steps = min(num_decay_steps, self.decay_steps)
88 |             lr = [
89 |                 base_lr * (self.decay_factor ** num_decay_steps)
90 |                 for base_lr in self.base_lrs
91 |                 ]
92 |         else:
93 |             # base lr
94 |             lr = [base_lr for base_lr in self.base_lrs]
95 |         return lr
96 | 


--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/train/smoothing.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | 
 5 | class LabelSmoothing(nn.Module):
 6 |     """
 7 |     NLL loss with label smoothing.
 8 |     """
 9 |     def __init__(self, padding_idx, smoothing=0.0):
10 |         """
11 |         Constructor for the LabelSmoothing module.
12 | 
13 |         :param padding_idx: index of the PAD token
14 |         :param smoothing: label smoothing factor
15 |         """
16 |         super(LabelSmoothing, self).__init__()
17 |         self.padding_idx = padding_idx
18 |         self.confidence = 1.0 - smoothing
19 |         self.smoothing = smoothing
20 | 
21 |     def forward(self, x, target):
22 |         logprobs = torch.nn.functional.log_softmax(x, dim=-1,
23 |                                                    dtype=torch.float32)
24 | 
25 |         non_pad_mask = (target != self.padding_idx)
26 |         nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
27 |         nll_loss = nll_loss.squeeze(1)[non_pad_mask]
28 |         smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
29 |         loss = self.confidence * nll_loss + self.smoothing * smooth_loss
30 |         return loss.sum()
31 | 


--------------------------------------------------------------------------------
/experiments/inception/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) Soumith Chintala 2016, 
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/experiments/inception/README.md:
--------------------------------------------------------------------------------
1 | # Inception v3
2 | The code inside this direectory is adapted from the Inception v3 code in
3 | `torchvision` and is covered by the BSD 3-Clause License. See the LICENSE file
4 | in this directory for more information.
5 | 
6 | https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py
7 | 
8 | 


--------------------------------------------------------------------------------
/experiments/inception/entry_point.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | from . import inception
 5 | 
 6 | 
 7 | def skyline_model_provider():
 8 |     return inception.inception_v3(init_weights=False, aux_logits=False).cuda()
 9 | 
10 | 
11 | def skyline_input_provider(batch_size=16):
12 |     return (
13 |         torch.randn((batch_size, 3, 299, 299)).cuda(),
14 |         torch.randint(low=0, high=1000, size=(batch_size,)).cuda(),
15 |     )
16 | 
17 | 
18 | def skyline_iteration_provider(model):
19 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
20 |     loss_fn = torch.nn.CrossEntropyLoss()
21 |     def iteration(*inputs):
22 |         data, labels = inputs
23 |         optimizer.zero_grad()
24 |         out = model(data)
25 |         out = loss_fn(out, labels)
26 |         out.backward()
27 |         optimizer.step()
28 |     return iteration
29 | 


--------------------------------------------------------------------------------
/experiments/process_raw_data.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | set -e
 4 | 
 5 | # Operate out of the script directory
 6 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P)
 7 | cd $SCRIPT_PATH
 8 | 
 9 | RESULTS_DIR="results/results-$(date "+%F_%H_%M")"
10 | 
11 | mkdir -p results
12 | mkdir $RESULTS_DIR
13 | mkdir $RESULTS_DIR/raw
14 | mkdir $RESULTS_DIR/ops
15 | mkdir $RESULTS_DIR/e2e
16 | mkdir $RESULTS_DIR/archives
17 | 
18 | for archive in $(ls *.tar.gz); do
19 |   tar xvzf $archive -C $RESULTS_DIR/raw
20 | done
21 | 
22 | python3 process_results.py \
23 |   --in-dir $RESULTS_DIR/raw \
24 |   --out-ops $RESULTS_DIR/ops \
25 |   --out-e2e $RESULTS_DIR/e2e
26 | 
27 | mv *.tar.gz $RESULTS_DIR/archives
28 | 
29 | 


--------------------------------------------------------------------------------
/experiments/resnet/LICENSE:
--------------------------------------------------------------------------------
 1 | NOTE: This license and disclaimer applies only to the "resnet.py" file in this
 2 | directory.
 3 | 
 4 | BSD 3-Clause License
 5 | 
 6 | Copyright (c) Soumith Chintala 2016,
 7 | All rights reserved.
 8 | 
 9 | Redistribution and use in source and binary forms, with or without
10 | modification, are permitted provided that the following conditions are met:
11 | 
12 | * Redistributions of source code must retain the above copyright notice, this
13 |   list of conditions and the following disclaimer.
14 | 
15 | * Redistributions in binary form must reproduce the above copyright notice,
16 |   this list of conditions and the following disclaimer in the documentation
17 |   and/or other materials provided with the distribution.
18 | 
19 | * Neither the name of the copyright holder nor the names of its
20 |   contributors may be used to endorse or promote products derived from
21 |   this software without specific prior written permission.
22 | 
23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 | 


--------------------------------------------------------------------------------
/experiments/resnet/README.md:
--------------------------------------------------------------------------------
1 | # ResNet
2 | The code inside this direectory is adapted from the ResNet code in `torchvision`
3 | and is covered by the BSD 3-Clause License. See the LICENSE file in this
4 | directory for more information.
5 | 
6 | https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
7 | 
8 | 


--------------------------------------------------------------------------------
/experiments/resnet/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/resnet/__init__.py


--------------------------------------------------------------------------------
/experiments/resnet/entry_point.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | from . import resnet
 5 | 
 6 | 
 7 | def skyline_model_provider():
 8 |     return resnet.resnet50().cuda()
 9 | 
10 | 
11 | def skyline_input_provider(batch_size=16):
12 |     return (
13 |         torch.randn((batch_size, 3, 224, 224)).cuda(),
14 |         torch.randint(low=0, high=1000, size=(batch_size,)).cuda(),
15 |     )
16 | 
17 | 
18 | def skyline_iteration_provider(model):
19 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
20 |     def iteration(*inputs):
21 |         optimizer.zero_grad()
22 |         out = model(*inputs)
23 |         out.backward()
24 |         optimizer.step()
25 |     return iteration
26 | 


--------------------------------------------------------------------------------
/experiments/transformer/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Victor Huang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/experiments/transformer/README.md:
--------------------------------------------------------------------------------
 1 | Transformer Model (Attention is All You Need)
 2 | =============================================
 3 | The `tfmr` directory contains a PyTorch implementation of the Transformer model
 4 | described in the "[Attention is All You
 5 | Need](https://arxiv.org/abs/1706.03762)" paper. This code was adapted from
 6 | Yu-Hsiang Huang's implementation found in
 7 | [jadore801120/attention-is-all-you-need-pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch).
 8 | 
 9 | License
10 | -------
11 | The code inside this directory is adapted from Yu-Hsiang Huang's implementation
12 | and therefore shares the same license. The unmodified license can be found in
13 | the `LICENSE` file.
14 | 


--------------------------------------------------------------------------------
/experiments/transformer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/transformer/__init__.py


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/Beam.py:
--------------------------------------------------------------------------------
  1 | """ Manage beam search info structure.
  2 | 
  3 |     Heavily borrowed from OpenNMT-py.
  4 |     For code in OpenNMT-py, please check the following link:
  5 |     https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py
  6 | """
  7 | 
  8 | import torch
  9 | import numpy as np
 10 | import transformer.tfmr.Constants as Constants
 11 | 
 12 | class Beam():
 13 |     ''' Beam search '''
 14 | 
 15 |     def __init__(self, size, device=False):
 16 | 
 17 |         self.size = size
 18 |         self._done = False
 19 | 
 20 |         # The score for each translation on the beam.
 21 |         self.scores = torch.zeros((size,), dtype=torch.float, device=device)
 22 |         self.all_scores = []
 23 | 
 24 |         # The backpointers at each time-step.
 25 |         self.prev_ks = []
 26 | 
 27 |         # The outputs at each time-step.
 28 |         self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)]
 29 |         self.next_ys[0][0] = Constants.BOS
 30 | 
 31 |     def get_current_state(self):
 32 |         "Get the outputs for the current timestep."
 33 |         return self.get_tentative_hypothesis()
 34 | 
 35 |     def get_current_origin(self):
 36 |         "Get the backpointers for the current timestep."
 37 |         return self.prev_ks[-1]
 38 | 
 39 |     @property
 40 |     def done(self):
 41 |         return self._done
 42 | 
 43 |     def advance(self, word_prob):
 44 |         "Update beam status and check if finished or not."
 45 |         num_words = word_prob.size(1)
 46 | 
 47 |         # Sum the previous scores.
 48 |         if len(self.prev_ks) > 0:
 49 |             beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
 50 |         else:
 51 |             beam_lk = word_prob[0]
 52 | 
 53 |         flat_beam_lk = beam_lk.view(-1)
 54 | 
 55 |         best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 1st sort
 56 |         best_scores, best_scores_id = flat_beam_lk.topk(self.size, 0, True, True) # 2nd sort
 57 | 
 58 |         self.all_scores.append(self.scores)
 59 |         self.scores = best_scores
 60 | 
 61 |         # bestScoresId is flattened as a (beam x word) array,
 62 |         # so we need to calculate which word and beam each score came from
 63 |         prev_k = best_scores_id / num_words
 64 |         self.prev_ks.append(prev_k)
 65 |         self.next_ys.append(best_scores_id - prev_k * num_words)
 66 | 
 67 |         # End condition is when top-of-beam is EOS.
 68 |         if self.next_ys[-1][0].item() == Constants.EOS:
 69 |             self._done = True
 70 |             self.all_scores.append(self.scores)
 71 | 
 72 |         return self._done
 73 | 
 74 |     def sort_scores(self):
 75 |         "Sort the scores."
 76 |         return torch.sort(self.scores, 0, True)
 77 | 
 78 |     def get_the_best_score_and_idx(self):
 79 |         "Get the score of the best in the beam."
 80 |         scores, ids = self.sort_scores()
 81 |         return scores[1], ids[1]
 82 | 
 83 |     def get_tentative_hypothesis(self):
 84 |         "Get the decoded sequence for the current timestep."
 85 | 
 86 |         if len(self.next_ys) == 1:
 87 |             dec_seq = self.next_ys[0].unsqueeze(1)
 88 |         else:
 89 |             _, keys = self.sort_scores()
 90 |             hyps = [self.get_hypothesis(k) for k in keys]
 91 |             hyps = [[Constants.BOS] + h for h in hyps]
 92 |             dec_seq = torch.LongTensor(hyps)
 93 | 
 94 |         return dec_seq
 95 | 
 96 |     def get_hypothesis(self, k):
 97 |         """ Walk back to construct the full hypothesis. """
 98 |         hyp = []
 99 |         for j in range(len(self.prev_ks) - 1, -1, -1):
100 |             hyp.append(self.next_ys[j+1][k])
101 |             k = self.prev_ks[j][k]
102 | 
103 |         return list(map(lambda x: x.item(), hyp[::-1]))
104 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/Constants.py:
--------------------------------------------------------------------------------
 1 | 
 2 | PAD = 0
 3 | UNK = 1
 4 | BOS = 2
 5 | EOS = 3
 6 | 
 7 | PAD_WORD = '<blank>'
 8 | UNK_WORD = '<unk>'
 9 | BOS_WORD = '<s>'
10 | EOS_WORD = '</s>'
11 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/Layers.py:
--------------------------------------------------------------------------------
 1 | ''' Define the Layers '''
 2 | import torch.nn as nn
 3 | from transformer.tfmr.SubLayers import MultiHeadAttention, PositionwiseFeedForward
 4 | 
 5 | __author__ = "Yu-Hsiang Huang"
 6 | 
 7 | 
 8 | class EncoderLayer(nn.Module):
 9 |     ''' Compose with two layers '''
10 | 
11 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
12 |         super(EncoderLayer, self).__init__()
13 |         self.slf_attn = MultiHeadAttention(
14 |             n_head, d_model, d_k, d_v, dropout=dropout)
15 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
16 | 
17 |     def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
18 |         enc_output, enc_slf_attn = self.slf_attn(
19 |             enc_input, enc_input, enc_input, mask=slf_attn_mask)
20 |         enc_output *= non_pad_mask
21 | 
22 |         enc_output = self.pos_ffn(enc_output)
23 |         enc_output *= non_pad_mask
24 | 
25 |         return enc_output, enc_slf_attn
26 | 
27 | 
28 | class DecoderLayer(nn.Module):
29 |     ''' Compose with three layers '''
30 | 
31 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
32 |         super(DecoderLayer, self).__init__()
33 |         self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
34 |         self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
35 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
36 | 
37 |     def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
38 |         dec_output, dec_slf_attn = self.slf_attn(
39 |             dec_input, dec_input, dec_input, mask=slf_attn_mask)
40 |         dec_output *= non_pad_mask
41 | 
42 |         dec_output, dec_enc_attn = self.enc_attn(
43 |             dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
44 |         dec_output *= non_pad_mask
45 | 
46 |         dec_output = self.pos_ffn(dec_output)
47 |         dec_output *= non_pad_mask
48 | 
49 |         return dec_output, dec_slf_attn, dec_enc_attn
50 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/Modules.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import numpy as np
 4 | 
 5 | __author__ = "Yu-Hsiang Huang"
 6 | 
 7 | class ScaledDotProductAttention(nn.Module):
 8 |     ''' Scaled Dot-Product Attention '''
 9 | 
10 |     def __init__(self, temperature, attn_dropout=0.1):
11 |         super().__init__()
12 |         self.temperature = temperature
13 |         self.dropout = nn.Dropout(attn_dropout)
14 |         self.softmax = nn.Softmax(dim=2)
15 | 
16 |     def forward(self, q, k, v, mask=None):
17 | 
18 |         attn = torch.bmm(q, k.transpose(1, 2))
19 |         attn = attn / self.temperature
20 | 
21 |         if mask is not None:
22 |             attn = attn.masked_fill(mask, -np.inf)
23 | 
24 |         attn = self.softmax(attn)
25 |         attn = self.dropout(attn)
26 |         output = torch.bmm(attn, v)
27 | 
28 |         return output, attn
29 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/Optim.py:
--------------------------------------------------------------------------------
 1 | '''A wrapper class for optimizer '''
 2 | import numpy as np
 3 | 
 4 | class ScheduledOptim():
 5 |     '''A simple wrapper class for learning rate scheduling'''
 6 | 
 7 |     def __init__(self, optimizer, d_model, n_warmup_steps):
 8 |         self._optimizer = optimizer
 9 |         self.n_warmup_steps = n_warmup_steps
10 |         self.n_current_steps = 0
11 |         self.init_lr = np.power(d_model, -0.5)
12 | 
13 |     def step_and_update_lr(self):
14 |         "Step with the inner optimizer"
15 |         self._update_learning_rate()
16 |         self._optimizer.step()
17 | 
18 |     def zero_grad(self):
19 |         "Zero out the gradients by the inner optimizer"
20 |         self._optimizer.zero_grad()
21 | 
22 |     def _get_lr_scale(self):
23 |         return np.min([
24 |             np.power(self.n_current_steps, -0.5),
25 |             np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])
26 | 
27 |     def _update_learning_rate(self):
28 |         ''' Learning rate scheduling per step '''
29 | 
30 |         self.n_current_steps += 1
31 |         lr = self.init_lr * self._get_lr_scale()
32 | 
33 |         for param_group in self._optimizer.param_groups:
34 |             param_group['lr'] = lr
35 | 
36 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/SubLayers.py:
--------------------------------------------------------------------------------
 1 | ''' Define the sublayers in encoder/decoder layer '''
 2 | import numpy as np
 3 | import torch.nn as nn
 4 | import torch.nn.functional as F
 5 | from transformer.tfmr.Modules import ScaledDotProductAttention
 6 | 
 7 | __author__ = "Yu-Hsiang Huang"
 8 | 
 9 | class MultiHeadAttention(nn.Module):
10 |     ''' Multi-Head Attention module '''
11 | 
12 |     def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
13 |         super().__init__()
14 | 
15 |         self.n_head = n_head
16 |         self.d_k = d_k
17 |         self.d_v = d_v
18 | 
19 |         self.w_qs = nn.Linear(d_model, n_head * d_k)
20 |         self.w_ks = nn.Linear(d_model, n_head * d_k)
21 |         self.w_vs = nn.Linear(d_model, n_head * d_v)
22 |         nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
23 |         nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
24 |         nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
25 | 
26 |         self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
27 |         self.layer_norm = nn.LayerNorm(d_model)
28 | 
29 |         self.fc = nn.Linear(n_head * d_v, d_model)
30 |         nn.init.xavier_normal_(self.fc.weight)
31 | 
32 |         self.dropout = nn.Dropout(dropout)
33 | 
34 | 
35 |     def forward(self, q, k, v, mask=None):
36 | 
37 |         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
38 | 
39 |         sz_b, len_q, _ = q.size()
40 |         sz_b, len_k, _ = k.size()
41 |         sz_b, len_v, _ = v.size()
42 | 
43 |         residual = q
44 | 
45 |         q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
46 |         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
47 |         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
48 | 
49 |         q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
50 |         k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
51 |         v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
52 | 
53 |         mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
54 |         output, attn = self.attention(q, k, v, mask=mask)
55 | 
56 |         output = output.view(n_head, sz_b, len_q, d_v)
57 |         output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)
58 | 
59 |         output = self.dropout(self.fc(output))
60 |         output = self.layer_norm(output + residual)
61 | 
62 |         return output, attn
63 | 
64 | class PositionwiseFeedForward(nn.Module):
65 |     ''' A two-feed-forward-layer module '''
66 | 
67 |     def __init__(self, d_in, d_hid, dropout=0.1):
68 |         super().__init__()
69 |         self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise
70 |         self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise
71 |         self.layer_norm = nn.LayerNorm(d_in)
72 |         self.dropout = nn.Dropout(dropout)
73 | 
74 |     def forward(self, x):
75 |         residual = x
76 |         output = x.transpose(1, 2)
77 |         output = self.w_2(F.relu(self.w_1(output)))
78 |         output = output.transpose(1, 2)
79 |         output = self.dropout(output)
80 |         output = self.layer_norm(output + residual)
81 |         return output
82 | 


--------------------------------------------------------------------------------
/experiments/transformer/tfmr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/transformer/tfmr/__init__.py


--------------------------------------------------------------------------------
/tools/device-metadata/README.md:
--------------------------------------------------------------------------------
1 | ## Peak Performance
2 | 
3 | Use `measure_peak_flops.py` to measure the peak performance (GFLOP/s) on a
4 | given GPU. Note that you need to run this script inside the Habitat container
5 | (or otherwise install the habitat-predictor Python package, which is located
6 | inside the analyzer top-level directory).
7 | 


--------------------------------------------------------------------------------
/tools/device-metadata/measure_peak_flops.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import statistics
 3 | 
 4 | import habitat
 5 | import habitat.habitat_cuda as hc
 6 | from habitat.analysis.metrics import Metric
 7 | from habitat.profiling.kernel import KernelProfiler
 8 | 
 9 | 
10 | def measure_peak_flops(profiler):
11 |     results = profiler.measure_kernels(hc._diagnostics.run_flop_test)
12 |     assert len(results) == 1
13 |     kernel = results[0]
14 |     gflops_per_second = (
15 |         kernel.get_metric(Metric.SinglePrecisionAddOps) / kernel.run_time_ns
16 |     )
17 |     efficiency = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency) / 100
18 |     return gflops_per_second / efficiency
19 | 
20 | 
21 | def main():
22 |     parser = argparse.ArgumentParser(
23 |         description="Measure the peak performance (FLOP/s) of a GPU."
24 |     )
25 |     parser.add_argument("device", help="The current device (e.g., RTX2070).")
26 |     parser.add_argument("--trials", type=int, default=5)
27 |     args = parser.parse_args()
28 | 
29 |     profiler = KernelProfiler(
30 |         getattr(habitat.Device, args.device),
31 |         metrics=[
32 |             Metric.SinglePrecisionFLOPEfficiency,
33 |             Metric.SinglePrecisionAddOps,
34 |         ],
35 |     )
36 | 
37 |     results = []
38 |     for trial in range(args.trials):
39 |         print("Running trial {}...".format(trial))
40 |         results.append(measure_peak_flops(profiler))
41 | 
42 |     print("Peak Performance on the {}".format(args.device))
43 |     print("===============================")
44 |     print("Median: {} GFLOP/s".format(statistics.median(results)))
45 |     print("Mean:   {} GFLOP/s".format(statistics.mean(results)))
46 |     print("Max.:   {} GFLOP/s".format(max(results)))
47 |     print("Min.:   {} GFLOP/s".format(min(results)))
48 |     print("Trials: {}".format(args.trials))
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 


--------------------------------------------------------------------------------
/tools/kernel-metadata/extract.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | function usage() {
 4 |   echo "Usage: $0 <output file name> path/to/libtorch.so"
 5 |   exit 1
 6 | }
 7 | 
 8 | if [ -z "$1" ] || [ -z "$2" ]; then
 9 |   usage "$@"
10 | fi
11 | 
12 | DATABASE_NAME=$1
13 | LIBTORCH_PATH=$2
14 | 
15 | declare -a SHARED_LIBS=(
16 |   $(ldd $LIBTORCH_PATH | grep -E -o "/\S+")
17 |   "$LIBTORCH_PATH"
18 | )
19 | 
20 | for shared_lib in ${SHARED_LIBS[@]}; do
21 |   echo "Processing $shared_lib"
22 |   cuobjdump -res-usage $shared_lib 2> /dev/null | \
23 |     python3 process-cuobjdump-output.py --database $DATABASE_NAME
24 | done
25 | 
26 | echo "Done!"
27 | 


--------------------------------------------------------------------------------
/tools/kernel-metadata/process-cuobjdump-output.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sqlite3
  3 | import sys
  4 | import re
  5 | 
  6 | ARCH_LINE_REGEX = re.compile('^arch = sm_(?P<arch>[0-9]+)$')
  7 | FUNC_LINE_REGEX = re.compile('^\sFunction\s(?P<name>.+):$')
  8 | RES_LINE_REGEX = re.compile('^\s\sREG:(?P<registers>[0-9]+)\s.*$')
  9 | 
 10 | 
 11 | class Parser:
 12 |     """
 13 |     Parses cuobjdump output that uses the -res-usage flag.
 14 | 
 15 |     This parser is implemented using a coroutine. Use the consume() method to
 16 |     send input lines to the parser. The consume() method returns a parsed
 17 |     kernel or None (when more input is required).
 18 |     """
 19 |     def __init__(self):
 20 |         self._impl = self._parser_coroutine()
 21 |         next(self._impl)
 22 | 
 23 |     def consume(self, line):
 24 |         result = self._impl.send(line)
 25 |         if result is not None:
 26 |             next(self._impl)
 27 |         return result
 28 | 
 29 |     def _parser_coroutine(self):
 30 |         arch = None
 31 | 
 32 |         while True:
 33 |             line = (yield)[:-1]
 34 | 
 35 |             arch_match = ARCH_LINE_REGEX.match(line)
 36 |             if arch_match is not None:
 37 |                 arch = int(arch_match.group('arch'))
 38 |                 continue
 39 | 
 40 |             func_line_match = FUNC_LINE_REGEX.match(line)
 41 |             if func_line_match is None:
 42 |                 continue
 43 | 
 44 |             # When we find a function, we expect the next line to be its
 45 |             # corresponding resource string
 46 |             func_name = func_line_match.group('name')
 47 | 
 48 |             res_line = (yield)[:-1]
 49 | 
 50 |             resource_match = RES_LINE_REGEX.match(res_line)
 51 |             if resource_match is None:
 52 |                 raise AssertionError(
 53 |                     'Missing resource information for function: ' + func_name)
 54 | 
 55 |             registers_per_thread = int(resource_match.group('registers'))
 56 |             yield (func_name, arch, registers_per_thread)
 57 | 
 58 | 
 59 | def ensure_tables_exist(connection):
 60 |     create_table = """
 61 |     CREATE TABLE IF NOT EXISTS kernels (
 62 |       name TEXT NOT NULL,
 63 |       arch INT NOT NULL,
 64 |       registers_per_thread INT NOT NULL,
 65 |       PRIMARY KEY (name, arch)
 66 |     )
 67 |     """
 68 |     cursor = connection.cursor()
 69 |     cursor.execute(create_table)
 70 |     connection.commit()
 71 | 
 72 | 
 73 | def insert_kernel(connection, name, arch, registers_per_thread):
 74 |     query = """
 75 |     INSERT INTO kernels (name, arch, registers_per_thread) VALUES (?, ?, ?)
 76 |     """
 77 |     cursor = connection.cursor()
 78 |     cursor.execute(query, (name, arch, registers_per_thread))
 79 | 
 80 | 
 81 | def process_cuobjdump_output(connection):
 82 |     parser = Parser()
 83 | 
 84 |     for line in iter(sys.stdin.readline, ''):
 85 |         kernel_info = parser.consume(line)
 86 |         if kernel_info is None:
 87 |             continue
 88 | 
 89 |         try:
 90 |             insert_kernel(connection, *kernel_info)
 91 |         except sqlite3.IntegrityError:
 92 |             # cuobjdump duplicates kernel entries - skip them for now
 93 |             pass
 94 | 
 95 | 
 96 | def main():
 97 |     parser = argparse.ArgumentParser()
 98 |     parser.add_argument('--database', type=str, required=True)
 99 |     args = parser.parse_args()
100 | 
101 |     connection = sqlite3.connect(args.database)
102 |     ensure_tables_exist(connection)
103 |     process_cuobjdump_output(connection)
104 |     connection.commit()
105 |     connection.close()
106 | 
107 | 
108 | if __name__ == '__main__':
109 |     main()
110 | 


--------------------------------------------------------------------------------
/tools/recording/database.py:
--------------------------------------------------------------------------------
  1 | import sqlite3
  2 | import logging
  3 | 
  4 | logger = logging.getLogger(__name__)
  5 | 
  6 | FEATURES_TEMPLATE = '{feature} INTEGER NOT NULL,'
  7 | 
  8 | 
  9 | class Recorder:
 10 |     def __init__(self, file_name, features):
 11 |         self._file_name = file_name
 12 |         self._features = features
 13 |         self._strings = {}
 14 |         self._connection, self._cursor = self._initialize()
 15 | 
 16 |     def _generate_queries(self):
 17 |         features_sql = ''.join(map(
 18 |             lambda f: FEATURES_TEMPLATE.format(feature=f),
 19 |             self._features,
 20 |         ))
 21 |         self._create_recordings = """
 22 |           CREATE TABLE IF NOT EXISTS recordings (
 23 |             id INTEGER PRIMARY KEY,
 24 |             {features}
 25 |             is_forward INTEGER NOT NULL,
 26 |             run_time_ms REAL NOT NULL
 27 |           )
 28 |         """.format(features=features_sql)
 29 |         self._insert_recording = """
 30 |           INSERT INTO recordings (
 31 |             {features},
 32 |             is_forward,
 33 |             run_time_ms
 34 |           )
 35 |           VALUES ({values} ?, ?)
 36 |         """.format(
 37 |             features=','.join(self._features),
 38 |             values='?,' * len(self._features),
 39 |         )
 40 | 
 41 |     def _initialize(self):
 42 |         self._generate_queries()
 43 |         connection = sqlite3.connect(self._file_name)
 44 |         cursor = connection.cursor()
 45 |         cursor.execute(self._create_recordings)
 46 |         cursor.execute("""
 47 |           CREATE TABLE IF NOT EXISTS kernels (
 48 |             id INTEGER PRIMARY KEY,
 49 |             recording_id INTEGER NOT NULL,
 50 |             kernel_name INTEGER NOT NULL,
 51 |             run_time_ns INTEGER NOT NULL
 52 |           )
 53 |         """)
 54 |         cursor.execute("""
 55 |           CREATE TABLE IF NOT EXISTS strings (
 56 |             id INTEGER PRIMARY KEY,
 57 |             value TEXT NOT NULL
 58 |           )
 59 |         """)
 60 |         connection.commit()
 61 |         return connection, cursor
 62 | 
 63 |     def get_num_recordings(self):
 64 |         self._cursor.execute("SELECT COUNT(*) FROM recordings")
 65 |         return self._cursor.fetchone()[0]
 66 | 
 67 |     def record(self, config, is_forward, run_time_ms, recorded_kernels):
 68 |         try:
 69 |             self._cursor.execute(
 70 |                 self._insert_recording,
 71 |                 (*tuple(map(int, config)), int(is_forward), run_time_ms),
 72 |             )
 73 |             recording_id = self._cursor.lastrowid
 74 |             for kernel in recorded_kernels:
 75 |                 if kernel.name in self._strings:
 76 |                     kernel_name = self._strings[kernel.name]
 77 |                 else:
 78 |                     self._cursor.execute(Recorder.insert_string, (kernel.name,))
 79 |                     kernel_name = self._cursor.lastrowid
 80 |                     self._strings[kernel.name] = kernel_name
 81 | 
 82 |                 self._cursor.execute(
 83 |                     Recorder.insert_kernel,
 84 |                     (recording_id, kernel_name, kernel.run_time_ns)
 85 |                 )
 86 |         except OverflowError:
 87 |             logger.warn(
 88 |                 'Could not record a kernel because its run time overflowed the'
 89 |                 'SQLite integer datatype.'
 90 |             )
 91 | 
 92 |     def commit(self):
 93 |         self._connection.commit()
 94 | 
 95 |     def __del__(self):
 96 |         self._connection.commit()
 97 |         self._connection.close()
 98 | 
 99 | 
100 | Recorder.insert_kernel = """
101 |   INSERT INTO kernels (recording_id, kernel_name, run_time_ns) VALUES (?, ?, ?)
102 | """
103 | 
104 | Recorder.insert_string = """
105 |   INSERT INTO strings (value) VALUES (?)
106 | """
107 | 


--------------------------------------------------------------------------------
/tools/recording/features.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | conv2d = [
 4 |     'bias',
 5 |     'batch',
 6 |     'image_size',
 7 |     'in_channels',
 8 |     'out_channels',
 9 |     'kernel_size',
10 |     'stride',
11 |     'padding',
12 | ]
13 | 
14 | bmm = [
15 |     'batch',
16 |     # (batch, left, middle) x (batch, middle, right)
17 |     'left',
18 |     'middle',
19 |     'right',
20 | ]
21 | 
22 | lstm = [
23 |     'bias', # 0 or 1, represents the bias flag
24 |     'bidirectional', # 0 or 1, represents the bidirectional flag
25 |     'batch',
26 |     'seq_len',
27 |     'input_size',
28 |     'hidden_size',
29 |     'num_layers',
30 | ]
31 | 
32 | linear = [
33 |     'bias',
34 |     'batch',
35 |     'in_features',
36 |     'out_features',
37 | ]
38 | 
39 | FEATURES = {
40 |     'bmm': bmm,
41 |     'conv2d': conv2d,
42 |     'linear': linear,
43 |     'lstm': lstm,
44 | }
45 | 


--------------------------------------------------------------------------------
/tools/recording/record_bmm.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | 
 4 | import torch
 5 | from record_common import Measurer
 6 | import features as f
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | def index_to_config(args, index):
12 |     batch = (index % args.batches) + 1
13 |     index //= args.batches
14 | 
15 |     left = (index % args.left) + 1
16 |     index //= args.left
17 | 
18 |     middle = (index % args.middle) + 1
19 |     index //= args.middle
20 | 
21 |     right = index + 1
22 | 
23 |     return (
24 |         batch,
25 |         left,
26 |         middle,
27 |         right,
28 |     )
29 | 
30 | 
31 | def config_to_profiler_args(config):
32 |     (batch, left, middle, right) = config
33 |     o1 = torch.randn((batch, left, middle)).cuda()
34 |     o2 = torch.randn((batch, middle, right)).cuda()
35 |     o1.requires_grad_()
36 |     o2.requires_grad_()
37 |     return {
38 |         'func': torch.bmm,
39 |         'args': (o1, o2),
40 |         'kwargs': {},
41 |     }
42 | 
43 | 
44 | def main():
45 |     measurer = Measurer(
46 |         op_name='bmm',
47 |         recorder_config=f.bmm,
48 |         index_to_config=index_to_config,
49 |         config_to_profiler_args=config_to_profiler_args,
50 |     )
51 |     parser = argparse.ArgumentParser()
52 |     measurer.add_args(parser)
53 |     parser.add_argument('--batches', type=int, default=128)
54 |     parser.add_argument('--left', type=int, default=1024)
55 |     parser.add_argument('--middle', type=int, default=1024)
56 |     parser.add_argument('--right', type=int, default=1024)
57 |     args = parser.parse_args()
58 | 
59 |     num_configs = (
60 |         args.batches *
61 |         args.left *
62 |         args.middle *
63 |         args.right
64 |     )
65 |     measurer.measure_configurations(args, num_configs)
66 | 
67 | 
68 | if __name__ == '__main__':
69 |     kwargs = {
70 |         "format": "%(asctime)s %(levelname)-8s %(message)s",
71 |         "datefmt": "%Y-%m-%d %H:%M",
72 |         "level": logging.INFO,
73 |     }
74 |     logging.basicConfig(**kwargs)
75 |     main()
76 | 


--------------------------------------------------------------------------------
/tools/recording/record_conv2d.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | import math
  4 | 
  5 | import torch
  6 | from record_common import Measurer
  7 | import features as f
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | MIN_IN_CHANNELS = 3
 12 | MIN_OUT_CHANNELS = 16
 13 | 
 14 | torch.backends.cudnn.benchmark = True
 15 | 
 16 | 
 17 | def index_to_config(args, index):
 18 |     bias = False if index % 2 == 0 else True
 19 |     index //= 2
 20 | 
 21 |     batch = (index % args.batches) + 1
 22 |     index //= args.batches
 23 | 
 24 |     image_size = (index % args.image_size) + 1
 25 |     index //= args.image_size
 26 | 
 27 |     in_channels = (index % args.in_channels) + 1
 28 |     index //= args.in_channels
 29 | 
 30 |     out_channels = (index % args.out_channels) + 1
 31 |     index //= args.out_channels
 32 | 
 33 |     kernel_size = (index % args.kernel_size) + 1
 34 |     index //= args.kernel_size
 35 | 
 36 |     stride = (index % args.stride) + 1
 37 |     index //= args.stride
 38 | 
 39 |     # Padding is 0-based
 40 |     padding = index
 41 | 
 42 |     return (
 43 |         bias,
 44 |         batch,
 45 |         image_size,
 46 |         in_channels,
 47 |         out_channels,
 48 |         kernel_size,
 49 |         stride,
 50 |         padding,
 51 |     )
 52 | 
 53 | 
 54 | def index_filter(args, index):
 55 |     config = index_to_config(args, index)
 56 |     # NOTE: We multiply because the dimensions have different ranges; we want
 57 |     #       them to each "contribute equally". We weigh the image size more to
 58 |     #       select smaller image sizes.
 59 |     # image_size (1-dim) * in_channels * out_channels * kernel_size
 60 |     conv_size = math.pow(config[2], 1.15) * config[3] * config[4] * config[5]
 61 | 
 62 |     # NOTE: This value was chosen arbitrarily: we don't want the in/out
 63 |     #       channels and image size to all be too large. This way, large values
 64 |     #       for the in/out channels would lead to a smaller image size (and
 65 |     #       vice versa).
 66 |     return conv_size <= 35000000
 67 | 
 68 | 
 69 | def config_to_profiler_args(config):
 70 |     (bias,
 71 |      batch,
 72 |      image_size,
 73 |      in_channels,
 74 |      out_channels,
 75 |      kernel_size,
 76 |      stride,
 77 |      padding) = config
 78 | 
 79 |     # Easiest way to exclude certain sample configurations
 80 |     if in_channels < MIN_IN_CHANNELS or out_channels < MIN_OUT_CHANNELS:
 81 |         return None
 82 | 
 83 |     device = torch.device('cuda')
 84 |     conv2d = torch.nn.Conv2d(
 85 |         in_channels=in_channels,
 86 |         out_channels=out_channels,
 87 |         kernel_size=kernel_size,
 88 |         stride=stride,
 89 |         padding=padding,
 90 |         bias=bias,
 91 |     ).to(device)
 92 |     inp = torch.randn((
 93 |         batch,
 94 |         in_channels,
 95 |         image_size,
 96 |         image_size,
 97 |     ), device=device)
 98 |     # NOTE: This is important: for most convolutions, we will also need the
 99 |     #       gradient with respect to the input to be able to backpropagate to
100 |     #       earlier operations in the network.
101 |     inp = inp.requires_grad_()
102 | 
103 |     return {
104 |         'func': conv2d,
105 |         'args': (inp,),
106 |         'kwargs': {},
107 |     }
108 | 
109 | 
110 | def main():
111 |     measurer = Measurer(
112 |         op_name='conv2d',
113 |         recorder_config=f.conv2d,
114 |         index_to_config=index_to_config,
115 |         index_filter=index_filter,
116 |         config_to_profiler_args=config_to_profiler_args,
117 |     )
118 | 
119 |     parser = argparse.ArgumentParser()
120 |     measurer.add_args(parser)
121 |     parser.add_argument('--batches', type=int, default=64)
122 |     parser.add_argument('--image-size', type=int, default=256)
123 |     parser.add_argument('--in-channels', type=int, default=2048)
124 |     parser.add_argument('--out-channels', type=int, default=2048)
125 |     parser.add_argument('--kernel-size', type=int, default=11)
126 |     parser.add_argument('--stride', type=int, default=4)
127 |     # Padding is 0-based, so this means we consider 0 to 3 inclusive
128 |     parser.add_argument('--padding', type=int, default=4)
129 |     args = parser.parse_args()
130 | 
131 |     num_configs = (
132 |         2 * # Whether or not there is a bias
133 |         args.batches *
134 |         args.image_size *
135 |         args.in_channels *
136 |         args.out_channels *
137 |         args.kernel_size *
138 |         args.stride *
139 |         args.padding
140 |     )
141 | 
142 |     # Conv2d has filtering, so we won't have exactly 200000 points (the
143 |     # default). So here we increase the number of starting points.
144 |     if args.num_points == 200000:
145 |         args.num_points *= 6
146 | 
147 |     measurer.measure_configurations(args, num_configs)
148 | 
149 | 
150 | if __name__ == '__main__':
151 |     kwargs = {
152 |         "format": "%(asctime)s %(levelname)-8s %(message)s",
153 |         "datefmt": "%Y-%m-%d %H:%M",
154 |         "level": logging.INFO,
155 |     }
156 |     logging.basicConfig(**kwargs)
157 |     main()
158 | 


--------------------------------------------------------------------------------
/tools/recording/record_linear.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import logging
 3 | 
 4 | import torch
 5 | from record_common import Measurer
 6 | import features as f
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
11 | def index_to_config(args, index):
12 |     bias = False if index % 2 == 0 else True
13 |     index //= 2
14 | 
15 |     batch = (index % args.batches) + 1
16 |     index //= args.batches
17 | 
18 |     in_features = (index % args.in_features) + 1
19 |     index //= args.in_features
20 | 
21 |     out_features = index + 1
22 | 
23 |     return (
24 |         bias,
25 |         batch,
26 |         in_features,
27 |         out_features,
28 |     )
29 | 
30 | 
31 | def config_to_profiler_args(config):
32 |     (bias, batch, in_features, out_features) = config
33 |     linear = torch.nn.Linear(
34 |         in_features=in_features, out_features=out_features, bias=bias).cuda()
35 |     inp = torch.randn((batch, in_features)).cuda()
36 |     # NOTE: This is important: for most linear layers, we will also need the
37 |     #       gradient with respect to the input to be able to backpropagate to
38 |     #       earlier operations in the network.
39 |     inp = inp.requires_grad_()
40 |     return {
41 |         'func': linear,
42 |         'args': (inp,),
43 |         'kwargs': {},
44 |     }
45 | 
46 | 
47 | def index_filter(args, index):
48 |     config = index_to_config(args, index)
49 |     # NOTE: We multiply because the dimensions have different ranges; we want
50 |     #       them to each "contribute equally". We weigh the image size more to
51 |     #       select smaller image sizes.
52 |     # batch * in_features * out_features
53 |     linear_size = config[1] * config[2] * config[3]
54 | 
55 |     # NOTE: This value was chosen arbitrarily: we don't want the in/out
56 |     #       features and batch to all be too large. This way, large values
57 |     #       for the in/out features would lead to a smaller batch size (and
58 |     #       vice versa).
59 |     return linear_size <= 840000000
60 | 
61 | 
62 | def main():
63 |     measurer = Measurer(
64 |         op_name='linear',
65 |         recorder_config=f.linear,
66 |         index_to_config=index_to_config,
67 |         index_filter=index_filter,
68 |         config_to_profiler_args=config_to_profiler_args,
69 |     )
70 |     parser = argparse.ArgumentParser()
71 |     measurer.add_args(parser)
72 |     parser.add_argument('--batches', type=int, default=3500)
73 |     parser.add_argument('--in-features', type=int, default=32768)
74 |     parser.add_argument('--out-features', type=int, default=32768)
75 |     args = parser.parse_args()
76 | 
77 |     # Linear has filtering, so we won't have exactly 200000 points (the
78 |     # default). So here we increase the number of starting points.
79 |     if args.num_points == 200000:
80 |         args.num_points *= 80
81 | 
82 |     num_configs = (
83 |         2 * # Whether or not there is a bias
84 |         args.batches *
85 |         args.in_features *
86 |         args.out_features
87 |     )
88 |     measurer.measure_configurations(args, num_configs)
89 | 
90 | 
91 | if __name__ == '__main__':
92 |     kwargs = {
93 |         "format": "%(asctime)s %(levelname)-8s %(message)s",
94 |         "datefmt": "%Y-%m-%d %H:%M",
95 |         "level": logging.INFO,
96 |     }
97 |     logging.basicConfig(**kwargs)
98 |     main()
99 | 


--------------------------------------------------------------------------------
/tools/recording/record_lstm.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import logging
  3 | 
  4 | import torch
  5 | from record_common import Measurer
  6 | import features as f
  7 | 
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | 
 11 | def index_to_config(args, index):
 12 |     bias = index % 2
 13 |     index //= 2
 14 | 
 15 |     bidirectional = index % 2
 16 |     index //= 2
 17 | 
 18 |     batch = (index % args.batches) + 1
 19 |     index //= args.batches
 20 | 
 21 |     seq_len = (index % args.seq_len) + 1
 22 |     index //= args.seq_len
 23 | 
 24 |     input_size = (index % args.input_size) + 1
 25 |     index //= args.input_size
 26 | 
 27 |     hidden_size = (index % args.hidden_size) + 1
 28 |     index //= args.hidden_size
 29 | 
 30 |     num_layers = index + 1
 31 | 
 32 |     return (
 33 |         bias,
 34 |         bidirectional,
 35 |         batch,
 36 |         seq_len,
 37 |         input_size,
 38 |         hidden_size,
 39 |         num_layers,
 40 |     )
 41 | 
 42 | 
 43 | def config_to_profiler_args(config):
 44 |     (bias,
 45 |      bidirectional,
 46 |      batch,
 47 |      seq_len,
 48 |      input_size,
 49 |      hidden_size,
 50 |      num_layers) = config
 51 |     inputs = torch.randn((seq_len, batch, input_size)).cuda()
 52 |     lstm = torch.nn.LSTM(
 53 |         input_size=input_size,
 54 |         hidden_size=hidden_size,
 55 |         num_layers=num_layers,
 56 |         bias=bool(bias),
 57 |         bidirectional=bool(bidirectional),
 58 |     ).cuda()
 59 |     # NOTE: This is important: for most LSTMs, we will also need the gradient
 60 |     #       with respect to the input to be able to backpropagate to earlier
 61 |     #       operations in the network.
 62 |     inputs = inputs.requires_grad_()
 63 |     return {
 64 |         'func': lstm,
 65 |         'args': (inputs,),
 66 |         'kwargs': {},
 67 |     }
 68 | 
 69 | 
 70 | def main():
 71 |     measurer = Measurer(
 72 |         op_name='lstm',
 73 |         recorder_config=f.lstm,
 74 |         index_to_config=index_to_config,
 75 |         config_to_profiler_args=config_to_profiler_args,
 76 |     )
 77 |     parser = argparse.ArgumentParser()
 78 |     measurer.add_args(parser)
 79 |     parser.add_argument('--batches', type=int, default=128)
 80 |     parser.add_argument('--seq-len', type=int, default=64)
 81 |     parser.add_argument('--input-size', type=int, default=1280)
 82 |     parser.add_argument('--hidden-size', type=int, default=1280)
 83 |     parser.add_argument('--num-layers', type=int, default=6)
 84 |     args = parser.parse_args()
 85 | 
 86 |     num_configs = (
 87 |         2 * # bias
 88 |         2 * # bidirectional
 89 |         args.batches *
 90 |         args.seq_len *
 91 |         args.input_size *
 92 |         args.hidden_size *
 93 |         args.num_layers
 94 |     )
 95 |     measurer.measure_configurations(args, num_configs)
 96 | 
 97 | 
 98 | if __name__ == '__main__':
 99 |     kwargs = {
100 |         "format": "%(asctime)s %(levelname)-8s %(message)s",
101 |         "datefmt": "%Y-%m-%d %H:%M",
102 |         "level": logging.INFO,
103 |     }
104 |     logging.basicConfig(**kwargs)
105 |     main()
106 | 


--------------------------------------------------------------------------------