├── .gitignore ├── .gitmodules ├── CITATION.cff ├── LICENSE ├── NOTICE ├── README.md ├── analyzer ├── .gitignore ├── README.md ├── extract-models.sh ├── habitat │ ├── __init__.py │ ├── analysis │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── device.py │ │ ├── kernels.py │ │ ├── metrics.py │ │ ├── mlp │ │ │ ├── .gitignore │ │ │ ├── __init__.py │ │ │ ├── dataset.py │ │ │ ├── dataset_process.py │ │ │ ├── devices.csv │ │ │ ├── devices.py │ │ │ ├── mlp.py │ │ │ ├── saved_models │ │ │ │ └── .gitignore │ │ │ └── train.py │ │ ├── operation.py │ │ ├── predictor.py │ │ ├── run_time.py │ │ ├── trace.py │ │ └── wave_scaling │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ ├── metadata.py │ │ │ ├── resimplified.py │ │ │ ├── roofline.py │ │ │ └── unified.py │ ├── data │ │ ├── __init__.py │ │ ├── checksums │ │ ├── devices.yml │ │ └── verify.sh │ ├── profiling │ │ ├── __init__.py │ │ ├── autograd.py │ │ ├── backward.py │ │ ├── kernel.py │ │ ├── operation.py │ │ └── run_time.py │ ├── tracking │ │ ├── __init__.py │ │ ├── base.py │ │ ├── callable.py │ │ ├── hook_manager.py │ │ └── operation.py │ └── utils.py ├── install-dev.sh ├── pyproject.toml └── setup.py ├── cpp ├── .gitignore ├── CMakeLists.txt ├── README.md ├── cmake │ ├── FindCUPTI.cmake │ └── FindNVPerf.cmake ├── external │ ├── CMakeLists.txt │ └── cupti_profilerhost_util │ │ ├── CMakeLists.txt │ │ ├── include │ │ ├── c_util │ │ │ ├── FileOp.h │ │ │ └── ScopeExit.h │ │ └── profilerhost_util │ │ │ ├── Eval.h │ │ │ ├── List.h │ │ │ ├── Metric.h │ │ │ └── Parser.h │ │ └── src │ │ └── profilerhost_util │ │ ├── Eval.cpp │ │ ├── List.cpp │ │ └── Metric.cpp └── src │ ├── CMakeLists.txt │ ├── cuda │ ├── CMakeLists.txt │ ├── cuda_macros.h │ ├── cuda_occupancy.h │ ├── cupti_exceptions.cpp │ ├── cupti_exceptions.h │ ├── cupti_macros.h │ ├── cupti_manager.cpp │ ├── cupti_profiler.cpp │ ├── cupti_profiler.h │ ├── cupti_tracer.cpp │ ├── diagnostics.cu │ ├── diagnostics.h │ ├── habitat_cupti.h │ ├── kernel.cpp │ ├── kernel.h │ ├── legacy_cupti_profiler.cpp │ ├── legacy_cupti_profiler.h │ ├── metrics.h │ ├── new_cupti_profiler.cpp │ ├── new_cupti_profiler.h │ ├── sampled_measurement.h │ ├── utils-inl.h │ └── utils.h │ ├── device_info.cpp │ ├── frontend │ ├── CMakeLists.txt │ ├── model_bindings.cpp │ ├── model_bindings.h │ ├── profiler.cpp │ └── profiler.h │ └── habitat_cuda.cpp ├── docker ├── Dockerfile ├── README.md ├── create-user.sh ├── setup.sh ├── start.sh └── vars.sh ├── experiments ├── .gitignore ├── dcgan │ ├── LICENSE │ ├── README.md │ ├── dcgan.py │ └── entry_point.py ├── gather_raw_data.sh ├── gnmt │ ├── README.md │ ├── __init__.py │ ├── entry_point.py │ └── seq2seq │ │ ├── LICENSE │ │ ├── data │ │ ├── config.py │ │ ├── dataset.py │ │ ├── sampler.py │ │ └── tokenizer.py │ │ ├── inference │ │ ├── beam_search.py │ │ └── inference.py │ │ ├── models │ │ ├── attention.py │ │ ├── decoder.py │ │ ├── encoder.py │ │ ├── gnmt.py │ │ └── seq2seq_base.py │ │ ├── train │ │ ├── fp_optimizers.py │ │ ├── lr_scheduler.py │ │ ├── smoothing.py │ │ └── trainer.py │ │ └── utils.py ├── inception │ ├── LICENSE │ ├── README.md │ ├── entry_point.py │ └── inception.py ├── process_raw_data.sh ├── process_results.py ├── resnet │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── entry_point.py │ └── resnet.py ├── run_experiment.py └── transformer │ ├── LICENSE │ ├── README.md │ ├── __init__.py │ ├── entry_point.py │ └── tfmr │ ├── Beam.py │ ├── Constants.py │ ├── Layers.py │ ├── Models.py │ ├── Modules.py │ ├── Optim.py │ ├── SubLayers.py │ ├── Translator.py │ └── __init__.py └── 
tools ├── device-metadata ├── README.md └── measure_peak_flops.py ├── kernel-metadata ├── extract.sh └── process-cuobjdump-output.py └── recording ├── combine_data.py ├── database.py ├── features.py ├── record_bmm.py ├── record_common.py ├── record_conv2d.py ├── record_linear.py ├── record_lstm.py └── to_dataset.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | 4 | # PyTorch serialized modules 5 | *.pt 6 | 7 | analyzer/habitat/data/LICENSE 8 | analyzer/habitat/data/NOTICE 9 | analyzer/habitat/data/README.md 10 | analyzer/habitat/data/kernels.sqlite 11 | analyzer/habitat/data/**/*.pth 12 | 13 | # Python 14 | *.pyc 15 | __pycache__ 16 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "cpp/external/pybind11"] 2 | path = cpp/external/pybind11 3 | url = https://github.com/pybind/pybind11 4 | [submodule "cpp/external/gflags"] 5 | path = cpp/external/gflags 6 | url = https://github.com/gflags/gflags 7 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use Habitat, please cite it as below." 3 | authors: 4 | - family-names: "Yu" 5 | given-names: "Geoffrey X." 6 | - family-names: "Gao" 7 | given-names: "Yubo" 8 | - family-names: "Golikov" 9 | given-names: "Pavel" 10 | - family-names: "Pekhimenko" 11 | given-names: "Gennady" 12 | title: "Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training" 13 | version: 1.0.0 14 | doi: 10.5281/zenodo.4885489 15 | date-released: 2021-06-01 16 | url: "https://github.com/geoffxy/habitat" 17 | preferred-citation: 18 | type: conference-paper 19 | authors: 20 | - family-names: "Yu" 21 | given-names: "Geoffrey X." 22 | - family-names: "Gao" 23 | given-names: "Yubo" 24 | - family-names: "Golikov" 25 | given-names: "Pavel" 26 | - family-names: "Pekhimenko" 27 | given-names: "Gennady" 28 | collection-title: "Proceedings of the 2021 USENIX Annual Technical Conference (USENIX ATC '21)" 29 | start: 503 30 | end: 521 31 | title: "Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training" 32 | month: 7 33 | year: 2021 34 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Geoffrey Yu 2 | Copyright 2021 Yubo Gao 3 | Copyright 2021 Pavel Golikov 4 | Copyright 2021 Gennady Pekhimenko 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this project except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Habitat: A Runtime-Based Computational Performance Predictor for Deep Neural Network Training 2 | 3 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4885489.svg)](https://doi.org/10.5281/zenodo.4885489) 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4876277.svg)](https://doi.org/10.5281/zenodo.4876277) 5 | 6 | Habitat is a tool that predicts a deep neural network's training iteration 7 | execution time on a given GPU. It currently supports PyTorch. To learn more 8 | about how Habitat works, please see our [research 9 | paper](https://arxiv.org/abs/2102.00527). 10 | 11 | 12 | ## Running From Source 13 | 14 | Currently, the only way to run Habitat is to build it from source. You should 15 | use the Docker image provided in this repository to make sure that you can 16 | compile the code. 17 | 18 | 1. Download the [Habitat pre-trained 19 | models](https://doi.org/10.5281/zenodo.4876277). 20 | 2. Run `extract-models.sh` under `analyzer` to extract and install the 21 | pre-trained models. 22 | 3. Run `setup.sh` under `docker/` to build the Habitat container image. 23 | 4. Run `start.sh` to start a new container. By default, your home directory 24 | will be mounted inside the container under `~/home`. 25 | 5. Once inside the container, run `install-dev.sh` under `analyzer/` to build 26 | and install the Habitat package. 27 | 6. In your scripts, `import habitat` to get access to Habitat. See 28 | `experiments/run_experiment.py` for an example showing how to use Habitat. 29 | 30 | **Note:** Habitat needs access to your GPU's performance counters, which 31 | requires special permissions if you are running with a recent driver (418.43 or 32 | later). If you encounter a `CUPTI_ERROR_INSUFFICIENT_PRIVILEGES` error when 33 | running Habitat, please follow the instructions 34 | [here](https://developer.nvidia.com/ERR_NVGPUCTRPERM) 35 | and in [issue #5](https://github.com/geoffxy/habitat/issues/5). 36 | 37 | 38 | ## License 39 | 40 | The code in this repository is licensed under the Apache 2.0 license (see 41 | `LICENSE` and `NOTICE`), with the exception of the files mentioned below. 42 | 43 | This software contains source code provided by NVIDIA Corporation. These files 44 | are: 45 | 46 | - The code under `cpp/external/cupti_profilerhost_util/` (CUPTI sample code) 47 | - `cpp/src/cuda/cuda_occupancy.h` 48 | 49 | The code mentioned above is licensed under the [NVIDIA Software Development 50 | Kit End User License Agreement](https://docs.nvidia.com/cuda/eula/index.html). 51 | 52 | We include the implementations of several deep neural networks under 53 | `experiments/` for our evaluation. These implementations are copyrighted by 54 | their original authors and carry their original licenses. Please see the 55 | corresponding `README` files and license files inside the subdirectories for 56 | more information. 57 | 58 | 59 | ## Research Paper 60 | 61 | Habitat began as a research project in the [EcoSystem 62 | Group](https://www.cs.toronto.edu/ecosystem) at the [University of 63 | Toronto](https://cs.toronto.edu). The accompanying research paper will appear 64 | in the proceedings of [USENIX 65 | ATC'21](https://www.usenix.org/conference/atc21/presentation/yu). If you are 66 | interested, you can read a preprint of the paper 67 | [here](https://arxiv.org/abs/2102.00527). 
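To make step 6 above concrete, here is a minimal sketch of the tracking-and-prediction workflow. The names follow the APIs under `analyzer/habitat/`; `run_iteration` is a placeholder for one training step of your own model, and `experiments/run_experiment.py` remains the authoritative example:

```python
import habitat

# Measure a single training iteration on the local GPU (an RTX 2070 here).
tracker = habitat.OperationTracker(device=habitat.Device.RTX2070)
with tracker.track():
    run_iteration()  # placeholder: your forward/backward/optimizer step

trace = tracker.get_tracked_trace()
print('Measured on RTX2070:', trace.run_time_ms, 'ms')

# Predict the same iteration's run time on a different GPU.
predicted = trace.to_device(habitat.Device.V100)
print('Predicted on V100:', predicted.run_time_ms, 'ms')
```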
68 | 69 | If you use Habitat in your research, please consider citing our paper: 70 | 71 | ```bibtex 72 | @inproceedings{habitat-yu21, 73 | author = {Yu, Geoffrey X. and Gao, Yubo and Golikov, Pavel and Pekhimenko, 74 | Gennady}, 75 | title = {{Habitat: A Runtime-Based Computational Performance Predictor for 76 | Deep Neural Network Training}}, 77 | booktitle = {{Proceedings of the 2021 USENIX Annual Technical Conference 78 | (USENIX ATC'21)}}, 79 | year = {2021}, 80 | } 81 | ``` 82 | -------------------------------------------------------------------------------- /analyzer/.gitignore: -------------------------------------------------------------------------------- 1 | habitat_predict.egg-info 2 | habitat/habitat_cuda.cpython-36m-x86_64-linux-gnu.so 3 | -------------------------------------------------------------------------------- /analyzer/README.md: -------------------------------------------------------------------------------- 1 | Habitat 2 | ======= 3 | This directory contains the Python source code for Habitat—a tool that predicts 4 | the execution time of DNN operations across different GPUs. 5 | -------------------------------------------------------------------------------- /analyzer/extract-models.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | if [ -z $1 ]; then 6 | >&2 echo "Usage: $0 path/to/habitat-models.tar.gz" 7 | >&2 echo "" 8 | >&2 echo "This script extracts and installs Habitat's pre-trained models." 9 | exit 1 10 | fi 11 | 12 | archive_loc=$(pwd)/$1 13 | 14 | script_loc=$(cd $(dirname $0) && pwd -P) 15 | cd $script_loc 16 | 17 | tar xzf $archive_loc -C habitat/data/ 18 | cd habitat/data/ 19 | 20 | ./verify.sh 21 | -------------------------------------------------------------------------------- /analyzer/habitat/__init__.py: -------------------------------------------------------------------------------- 1 | from habitat.analysis import Device 2 | from habitat.analysis.metrics import Metric 3 | from habitat.analysis.predictor import Predictor 4 | from habitat.tracking.operation import OperationTracker 5 | 6 | __version__ = '1.0.0' 7 | __description__ = 'Cross-GPU performance predictions for PyTorch neural network training.' 8 | 9 | __author__ = 'Geoffrey Yu' 10 | __email__ = 'gxyu@cs.toronto.edu' 11 | 12 | __license__ = 'Apache-2.0' 13 | 14 | __all__ = [ 15 | 'Device', 16 | 'Metric', 17 | 'OperationTracker', 18 | 'Predictor', 19 | ] 20 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | from habitat.analysis.device import _Device 2 | 3 | Device = _Device() 4 | 5 | SPECIAL_OPERATIONS = { 6 | # Convolution 7 | 'conv2d', 8 | 9 | # Matrix multiply operations 10 | 'linear', 11 | 'bmm', 12 | 13 | # Recurrent operations 14 | 'lstm', 15 | 'gru', 16 | 'rnn_tanh', 17 | 'rnn_relu', 18 | } 19 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/arguments.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Arguments: 5 | """ 6 | Stores representations of an operation's arguments. 
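 
    Tensor arguments are not stored directly: from_raw_arguments() keeps only
    each tensor's dimensions (via _process_argument below), so an Arguments
    instance stays lightweight enough to keep in a trace.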
7 | """ 8 | def __init__(self, args, kwargs): 9 | self.args = args 10 | self.kwargs = kwargs 11 | self.special = {} 12 | 13 | @classmethod 14 | def from_raw_arguments(cls, args, kwargs): 15 | processed_args = tuple(map(_process_argument, args)) 16 | processed_kwargs = { 17 | arg_name: _process_argument(arg_value) 18 | for arg_name, arg_value in kwargs.items() 19 | } 20 | return cls(processed_args, processed_kwargs) 21 | 22 | 23 | def _process_argument(argument): 24 | if isinstance(argument, tuple): 25 | return tuple(map(_process_argument, argument)) 26 | 27 | if isinstance(argument, list): 28 | return list(map(_process_argument, argument)) 29 | 30 | # At this point we expect the argument to either be a 31 | # torch.Tensor or to be a scalar (e.g., an integer). 32 | if isinstance(argument, torch.Tensor): 33 | # We only store the tensor dimensions 34 | return argument.size() 35 | else: 36 | return argument 37 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/device.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # This singleton class simulates an enum that consists of the GPU devices we 4 | # support. Users can access a device using its identifier (e.g., Device.V100). 5 | class _Device: 6 | def __init__(self): 7 | self._devices = None 8 | 9 | def __getattr__(self, device_name): 10 | if self._devices is None: 11 | # Lazily load the devices on the first access 12 | self._load_devices() 13 | return self._devices[device_name] 14 | 15 | def _load_devices(self): 16 | import yaml 17 | import habitat.habitat_cuda as hc 18 | import habitat.data as hd 19 | with open(hd.path_to_data('devices.yml')) as devices_yaml: 20 | devices = yaml.load(devices_yaml, Loader=yaml.Loader) 21 | self._devices = { 22 | device_name: hc.DeviceProperties(name=device_name, **properties) 23 | for device_name, properties in devices.items() 24 | } 25 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/kernels.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class MeasuredKernel: 4 | def __init__(self, time_kernel, metrics_kernels, device): 5 | self._c = time_kernel 6 | self._metrics_kernels = metrics_kernels 7 | self._device = device 8 | self._cached_metrics = {} 9 | 10 | def get_metric(self, metric_info, default=None): 11 | if metric_info in self._cached_metrics: 12 | return self._cached_metrics[metric_info] 13 | 14 | for metric_kernel in self._metrics_kernels: 15 | for raw_metric_name, raw_metric_value in metric_kernel.metrics: 16 | if (raw_metric_name == metric_info.value.cupti_name or 17 | raw_metric_name == metric_info.value.legacy_cupti_name): 18 | canonical_value = metric_info.value.to_canonical_value( 19 | raw_metric_value, self._device) 20 | self._cached_metrics[metric_info] = canonical_value 21 | return canonical_value 22 | 23 | if default is None: 24 | raise AttributeError('Unknown metric: {}'.format(metric_info.name)) 25 | 26 | return default 27 | 28 | def __getattr__(self, name): 29 | # Delegate to the underlying C++ object for non-overridden attributes 30 | return getattr(self._c, name) 31 | 32 | 33 | class PredictedKernel: 34 | def __init__(self, measured_kernel, run_time_ns): 35 | self._measured_kernel = measured_kernel 36 | self._run_time_ns = run_time_ns 37 | 38 | @property 39 | def run_time_ns(self): 40 | return self._run_time_ns 41 | 42 | @property 43 | def name(self): 44 | return 
self._measured_kernel.name 45 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/metrics.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class _MetricInfo: 5 | def __init__( 6 | self, 7 | cupti_name, 8 | legacy_cupti_name, 9 | legacy_to_canonical_fn 10 | ): 11 | self._cupti_name = cupti_name 12 | self._legacy_cupti_name = legacy_cupti_name 13 | self._legacy_to_canonical_fn = legacy_to_canonical_fn 14 | 15 | @property 16 | def cupti_name(self): 17 | return self._cupti_name 18 | 19 | @property 20 | def legacy_cupti_name(self): 21 | return self._legacy_cupti_name 22 | 23 | def to_canonical_value(self, value, device): 24 | if device.compute_capability[0] >= 7: 25 | return value 26 | return self._legacy_to_canonical_fn(value) 27 | 28 | 29 | class Metric(Enum): 30 | DRAMUtilization = _MetricInfo( 31 | 'dram__throughput.avg.pct_of_peak_sustained_elapsed', 32 | 'dram_utilization', 33 | lambda value: value * 10, 34 | ) 35 | DRAMReadBytes = _MetricInfo( 36 | 'dram__bytes_read.sum', 37 | 'dram_read_bytes', 38 | lambda value: value, 39 | ) 40 | DRAMWriteBytes = _MetricInfo( 41 | 'dram__bytes_write.sum', 42 | 'dram_write_bytes', 43 | lambda value: value, 44 | ) 45 | SinglePrecisionFLOPEfficiency = _MetricInfo( 46 | 'smsp__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.avg.pct_of_peak_sustained_elapsed', 47 | 'flop_sp_efficiency', 48 | lambda value: value, 49 | ) 50 | SinglePrecisionAddOps = _MetricInfo( 51 | 'smsp__sass_thread_inst_executed_op_fadd_pred_on.sum', 52 | 'flop_count_sp_add', 53 | lambda value: value, 54 | ) 55 | 56 | 57 | def resolve_metrics(metrics, device): 58 | """ 59 | Converts Metric enum values into raw metric strings that can be passed to 60 | CUPTI, depending on the compute capability of the given device. 61 | 62 | This is needed because the metrics names changed after (and including) 63 | compute capability 7.0 (Volta). 64 | 65 | If the metrics passed in are already resolved, this function will return a 66 | copy of them. 
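 
    For example, resolve_metrics(Metric.DRAMReadBytes, device) returns
    ['dram__bytes_read.sum'] on devices with compute capability 7.0 (Volta)
    or newer, and ['dram_read_bytes'] on older devices.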
67 | """ 68 | if metrics is None: 69 | return [] 70 | 71 | if isinstance(metrics, list) or isinstance(metrics, tuple): 72 | return [ 73 | _get_metric_name(metric, device) 74 | for metric in metrics 75 | ] 76 | else: 77 | return [_get_metric_name(metrics, device)] 78 | 79 | 80 | def _get_metric_name(metric, device): 81 | if isinstance(metric, Metric): 82 | return ( 83 | metric.value.cupti_name 84 | if device.compute_capability[0] >= 7 85 | else metric.value.legacy_cupti_name 86 | ) 87 | else: 88 | return metric 89 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/.gitignore: -------------------------------------------------------------------------------- 1 | logs 2 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/analysis/mlp/__init__.py -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from torch.utils.data import Dataset 5 | 6 | from habitat.analysis.mlp.dataset_process import get_dataset 7 | 8 | 9 | class HabitatDataset(Dataset): 10 | def __init__(self, dataset_path, features): 11 | self.x, self.y = get_dataset(dataset_path, features) 12 | 13 | # input normalization 14 | self.x = np.array(self.x) 15 | 16 | self.mu = np.mean(self.x, axis=0) 17 | self.sigma = np.std(self.x, axis=0) 18 | 19 | self.x = np.divide(np.subtract(self.x, self.mu), self.sigma) 20 | 21 | def __len__(self): 22 | return len(self.y) 23 | 24 | def __getitem__(self, idx): 25 | if torch.is_tensor(idx): 26 | idx = idx.tolist() 27 | 28 | return torch.from_numpy(np.array(self.x[idx]).astype(np.float32), ), float(self.y[idx]) 29 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/dataset_process.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | import glob 4 | import functools 5 | from tqdm import tqdm 6 | 7 | from habitat.analysis.mlp.devices import get_device_features, get_all_devices 8 | 9 | 10 | def get_dataset(path, features, device_features=None): 11 | if device_features is None: 12 | device_features = ['mem', 'mem_bw', 'num_sm', 'single'] 13 | 14 | SELECT_QUERY = """ 15 | SELECT {features}, SUM(run_time_ms) AS run_time_ms 16 | FROM recordings 17 | GROUP BY {features} 18 | """ 19 | 20 | # read datasets 21 | files = glob.glob(path + "/*.sqlite") 22 | 23 | # read individual sqlite files and categorize by device 24 | devices = dict() 25 | for f in files: 26 | device_name = f.split("/")[-1].split("-")[1] 27 | 28 | conn = sqlite3.connect(f) 29 | query = SELECT_QUERY.format(features=",".join(features)) 30 | 31 | df = pd.read_sql_query(query, conn) 32 | df = df.rename(columns={"run_time_ms": device_name}) 33 | 34 | print("Loaded file %s (%d entries)" % (f, len(df.index))) 35 | 36 | if device_name not in devices: 37 | devices[device_name] = df 38 | else: 39 | devices[device_name] = devices[device_name].append(df) 40 | 41 | for device in devices.keys(): 42 | print("Device %s contains %d entries" % (device, len(devices[device].index))) 43 | 44 | print() 45 | 46 | 
print("Merging") 47 | df_merged = functools.reduce( 48 | lambda df1, df2: pd.merge(df1, df2, on=features), 49 | devices.values() 50 | ) 51 | 52 | print("Generating dataset") 53 | # generate vectorized dataset (one entry for each device with device params) 54 | device_params = get_all_devices(device_features) 55 | 56 | x, y = [], [] 57 | for device in devices.keys(): 58 | df_merged_device = df_merged[features + [device, ]] 59 | for row in tqdm(df_merged_device.iterrows(), leave=False, desc=device, total=len(df_merged_device.index)): 60 | row = row[1] 61 | 62 | x.append(list(row[:-1]) + device_params[device]) 63 | y.append(row[-1]) 64 | 65 | return x, y 66 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/devices.csv: -------------------------------------------------------------------------------- 1 | device,mem,mem_type,mem_bw,num_sm,double,single,half 2 | P100,16,HBM2,732,56,4.7,9.3,18.7 3 | P4000,8,GDDR5,243,14,0.1656,5.3, 4 | RTX2070,8,GDDR6,448,36,0.20304,6.49728,12.99456 5 | RTX2080Ti,11,GDDR6,616,68,0.3672,11.7504,23.5008 6 | T4,16,GDDR6,320,40,,8.1, 7 | V100,16,HBM2,900,80,,14.028, 8 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/devices.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def get_device_features(device_name, device_params): 6 | file_dir = os.path.abspath(os.path.dirname(__file__)) 7 | df = pd.read_csv(os.path.join(file_dir, "devices.csv")) 8 | df = df[['device', ] + device_params] 9 | result = df[df['device'] == device_name].iloc[0] 10 | return list(result)[1:] 11 | 12 | def get_all_devices(device_params=None): 13 | file_dir = os.path.abspath(os.path.dirname(__file__)) 14 | df = pd.read_csv(os.path.join(file_dir, "devices.csv")) 15 | if type(device_params) is list: 16 | df = df[['device',] + device_params] 17 | 18 | return { 19 | row[1]: list(row[2:]) for row in df.itertuples() 20 | } 21 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/saved_models/.gitignore: -------------------------------------------------------------------------------- 1 | bmm/* 2 | conv2d/* 3 | linear/* 4 | lstm/* 5 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/mlp/train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import random 3 | import torch 4 | import numpy 5 | 6 | from habitat.analysis.mlp.mlp import RuntimePredictor 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(description="MLP Training Script") 11 | parser.add_argument("operation", type=str) 12 | parser.add_argument("dataset_path", type=str) 13 | parser.add_argument("--layers", type=int, default=8) 14 | parser.add_argument("--layer_size", type=int, default=1024) 15 | parser.add_argument("--epochs", type=int, default=80) 16 | parser.add_argument("--seed", type=int, default=1337) 17 | 18 | args = parser.parse_args() 19 | 20 | # Ensure reproducibility 21 | random.seed(args.seed) 22 | torch.manual_seed(args.seed) 23 | numpy.random.seed(args.seed) 24 | 25 | predictor = RuntimePredictor(args.operation, args.layers, args.layer_size) 26 | predictor.train_with_dataset(args.dataset_path, epochs=args.epochs) 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | 
-------------------------------------------------------------------------------- /analyzer/habitat/analysis/operation.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Operation: 4 | """ 5 | Abstract representation of a logical operation in a model. 6 | """ 7 | def __repr__(self): 8 | return self.name 9 | 10 | @property 11 | def run_time_ms(self): 12 | if self.backward is None: 13 | return self.forward.run_time_ms 14 | return self.forward.run_time_ms + self.backward.run_time_ms 15 | 16 | @property 17 | def ktime_ns(self): 18 | if self.backward is None: 19 | return self.forward.ktime_ns 20 | return self.forward.ktime_ns + self.backward.ktime_ns 21 | 22 | @property 23 | def arguments(self): 24 | return None 25 | 26 | @property 27 | def forward(self): 28 | raise NotImplementedError 29 | 30 | @property 31 | def backward(self): 32 | raise NotImplementedError 33 | 34 | @property 35 | def name(self): 36 | raise NotImplementedError 37 | 38 | @property 39 | def device(self): 40 | raise NotImplementedError 41 | 42 | def to_device(self, dest_device, predictor): 43 | raise NotImplementedError 44 | 45 | 46 | class MeasuredOperation(Operation): 47 | def __init__( 48 | self, 49 | name, 50 | arguments, 51 | forward, 52 | backward, 53 | device, 54 | ): 55 | super().__init__() 56 | self._name = name 57 | self._arguments = arguments 58 | self._forward = forward 59 | self._backward = backward 60 | self._device = device 61 | 62 | @property 63 | def name(self): 64 | return self._name 65 | 66 | @property 67 | def arguments(self): 68 | return self._arguments 69 | 70 | @property 71 | def forward(self): 72 | return self._forward 73 | 74 | @property 75 | def backward(self): 76 | return self._backward 77 | 78 | @property 79 | def device(self): 80 | return self._device 81 | 82 | def to_device(self, dest_device, predictor): 83 | if dest_device.name == self._device.name: 84 | return self 85 | return predictor.predict_operation(self, dest_device) 86 | 87 | 88 | class PredictedOperation(Operation): 89 | def __init__( 90 | self, 91 | measured_operation, 92 | forward, 93 | backward, 94 | device 95 | ): 96 | self._measured_operation = measured_operation 97 | self._forward = forward 98 | self._backward = backward 99 | self._device = device 100 | 101 | @property 102 | def name(self): 103 | return self._measured_operation.name 104 | 105 | @property 106 | def arguments(self): 107 | return self._measured_operation.arguments 108 | 109 | @property 110 | def forward(self): 111 | return self._forward 112 | 113 | @property 114 | def backward(self): 115 | return self._backward 116 | 117 | @property 118 | def device(self): 119 | return self._device 120 | 121 | def to_device(self, dest_device, predictor): 122 | raise RuntimeError( 123 | 'Cannot make a prediction using a predicted operation.', 124 | ) 125 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/run_time.py: -------------------------------------------------------------------------------- 1 | from habitat.analysis.kernels import MeasuredKernel 2 | from habitat.utils import ns_to_ms 3 | 4 | 5 | class RunTime: 6 | @property 7 | def run_time_ms(self): 8 | raise NotImplementedError 9 | 10 | @property 11 | def ktime_ns(self): 12 | return sum(map(lambda k: k.run_time_ns, self.kernels)) 13 | 14 | @property 15 | def kernels(self): 16 | return [] 17 | 18 | @property 19 | def device(self): 20 | raise NotImplementedError 21 | 22 | 23 | class RunTimeMeasurement(RunTime): 24 | def 
__init__(self, run_time_ms, kernels, device):
25 |         self._run_time_ms = run_time_ms
26 |         self._kernels = kernels
27 |         self._device = device
28 | 
29 |     @property
30 |     def run_time_ms(self):
31 |         return self._run_time_ms
32 | 
33 |     @property
34 |     def kernels(self):
35 |         return self._kernels
36 | 
37 |     @property
38 |     def device(self):
39 |         return self._device
40 | 
41 | 
42 | class RunTimePrediction(RunTime):
43 |     def __init__(self, overhead_ns, predicted_kernels, device):
44 |         self._run_time_ms = None
45 |         self._overhead_ns = overhead_ns
46 |         self._predicted_kernels = predicted_kernels
47 |         self._device = device
48 | 
49 |     @property
50 |     def run_time_ms(self):
51 |         if self._run_time_ms is not None:
52 |             return self._run_time_ms
53 |         run_time_ns = self._overhead_ns + sum(map(
54 |             lambda k: k.run_time_ns,
55 |             self.kernels,
56 |         ))
57 |         self._run_time_ms = ns_to_ms(run_time_ns)
58 |         return self._run_time_ms
59 | 
60 |     @property
61 |     def kernels(self):
62 |         return self._predicted_kernels
63 | 
64 |     @property
65 |     def device(self):
66 |         return self._device
67 | 
68 | 
69 | class RunTimePurePrediction(RunTime):
70 |     def __init__(self, run_time_ms, device):
71 |         self._run_time_ms = run_time_ms
72 |         self._device = device
73 | 
74 |     @property
75 |     def run_time_ms(self):
76 |         # This class is constructed with the final run time, so there are
77 |         # no per-kernel predictions or overhead to aggregate.
78 |         return self._run_time_ms
79 | 
80 |     @property
81 |     def device(self):
82 |         return self._device
83 | 
-------------------------------------------------------------------------------- /analyzer/habitat/analysis/trace.py: --------------------------------------------------------------------------------
1 | from itertools import chain
2 | from habitat.analysis.predictor import Predictor
3 | 
4 | 
5 | class Trace:
6 |     """
7 |     Represents an operation trace that was measured on a given device.
8 |     """
9 | 
10 |     # Used by default to make cross-device predictions.
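    # Trace.to_device() falls back to this shared instance when the caller
    # does not pass a predictor explicitly.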
11 | DefaultPredictor = Predictor() 12 | 13 | def __init__(self, device, operations): 14 | self._device = device 15 | self._operations = operations 16 | self._run_time_ms = None 17 | 18 | @property 19 | def operations(self): 20 | return self._operations 21 | 22 | @property 23 | def device(self): 24 | return self._device 25 | 26 | @property 27 | def run_time_ms(self): 28 | if self._run_time_ms is not None: 29 | return self._run_time_ms 30 | 31 | self._run_time_ms = sum(map( 32 | lambda op: op.run_time_ms, 33 | self._operations, 34 | )) 35 | 36 | return self._run_time_ms 37 | 38 | def to_device(self, dest_device, predictor=None): 39 | """Get a predicted trace for the specified device.""" 40 | if dest_device.name == self.device.name: 41 | return self 42 | 43 | actual_predictor = ( 44 | Trace.DefaultPredictor if predictor is None else predictor 45 | ) 46 | 47 | operations = [ 48 | operation.to_device(dest_device, actual_predictor) 49 | for operation in self._operations 50 | ] 51 | return Trace(dest_device, operations) 52 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/analysis/wave_scaling/__init__.py -------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/common.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def calculate_wave_info(kernel, origin_device, dest_device, metadata_manager): 4 | origin_occupancy = kernel.thread_block_occupancy(origin_device) 5 | origin_wave_size = origin_device.num_sms * origin_occupancy 6 | 7 | dest_registers_per_thread = metadata_manager.kernel_registers_for( 8 | kernel, 9 | dest_device, 10 | ) 11 | if dest_registers_per_thread is not None: 12 | dest_occupancy = kernel.thread_block_occupancy( 13 | dest_device, 14 | dest_registers_per_thread, 15 | ) 16 | else: 17 | dest_occupancy = kernel.thread_block_occupancy(dest_device) 18 | dest_wave_size = dest_device.num_sms * dest_occupancy 19 | 20 | return origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy 21 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/metadata.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class MetadataManager: 8 | def __init__(self, path_to_lut): 9 | self._connection = sqlite3.connect(path_to_lut) 10 | 11 | 12 | def kernel_registers_for(self, kernel, device): 13 | arch = int(''.join(map(lambda x: str(x), device.compute_capability))) 14 | cursor = self._connection.cursor() 15 | result = cursor.execute( 16 | MetadataManager.kernel_registers_query, 17 | (kernel.name, arch), 18 | ).fetchone() 19 | 20 | if result is None: 21 | logger.debug( 22 | 'Missing kernel metadata entry for "%s" on arch %d.', 23 | kernel.name, 24 | arch, 25 | ) 26 | return result 27 | 28 | actual_arch, registers_per_thread = result 29 | if actual_arch != arch: 30 | logger.debug( 31 | 'Using substitute entry for "%s" at arch %d instead of %d.', 32 | kernel.name, 33 | actual_arch, 34 | arch, 35 | ) 36 | 37 | return registers_per_thread 38 | 39 | 40 | MetadataManager.kernel_registers_query = """ 41 | SELECT arch, 
registers_per_thread FROM kernels
42 |     WHERE name = ? AND arch <= ?
43 |     ORDER BY arch DESC LIMIT 1
44 | """
45 | 
-------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/resimplified.py: --------------------------------------------------------------------------------
1 | from habitat.analysis.kernels import PredictedKernel
2 | from habitat.analysis.wave_scaling.common import calculate_wave_info
3 | 
4 | 
5 | def resimplified_wave_scaling(
6 |     kernel,
7 |     origin_device,
8 |     dest_device,
9 |     metadata_manager,
10 | ):
11 |     origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy = (
12 |         calculate_wave_info(
13 |             kernel,
14 |             origin_device,
15 |             dest_device,
16 |             metadata_manager,
17 |         )
18 |     )
19 | 
20 |     # Check if the kernel is too "small" - if it doesn't fill a single wave
21 |     # on the current device AND if it doesn't fill a single wave on the
22 |     # destination device
23 |     if (kernel.num_blocks // origin_wave_size == 0 and
24 |             kernel.num_blocks // dest_wave_size == 0):
25 |         # The kernel is too small for wave behavior to matter, so we leave
26 |         # its measured run time unchanged.
27 |         return PredictedKernel(kernel, kernel.run_time_ns)
28 | 
29 |     bandwidth_ratio = (
30 |         origin_device.mem_bandwidth_gb / dest_device.mem_bandwidth_gb
31 |     )
32 | 
33 |     return PredictedKernel(kernel, kernel.run_time_ns * bandwidth_ratio)
34 | 
-------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/roofline.py: --------------------------------------------------------------------------------
1 | import math
2 | 
3 | from habitat.analysis.metrics import Metric
4 | from habitat.analysis.kernels import PredictedKernel
5 | from habitat.analysis.wave_scaling.common import calculate_wave_info
6 | 
7 | 
8 | def roofline_wave_scaling(
9 |     kernel,
10 |     origin_device,
11 |     dest_device,
12 |     metadata_manager,
13 | ):
14 |     gamma = _roofline_gamma(kernel, origin_device, dest_device)
15 |     gamma_compl = 1.0 - gamma
16 | 
17 |     origin_wave_size, dest_wave_size, origin_occupancy, dest_occupancy = (
18 |         calculate_wave_info(
19 |             kernel,
20 |             origin_device,
21 |             dest_device,
22 |             metadata_manager,
23 |         )
24 |     )
25 | 
26 |     # 1. Check if the kernel is too "small" - if it doesn't fill a single wave
27 |     # on the current device AND if it doesn't fill a single wave on the
28 |     # destination device
29 |     if (kernel.num_blocks // origin_wave_size == 0 and
30 |             kernel.num_blocks // dest_wave_size == 0):
31 |         # We scale the run time by the compute factor only
32 |         origin_max_occupancy = math.ceil(
33 |             kernel.num_blocks / origin_device.num_sms
34 |         )
35 |         dest_max_occupancy = math.ceil(
36 |             kernel.num_blocks / dest_device.num_sms
37 |         )
38 |         partial_compute_factor = (
39 |             (origin_device.base_clock_mhz / dest_device.base_clock_mhz) *
40 |             (dest_max_occupancy / origin_max_occupancy)
41 |         )
42 |         return PredictedKernel(
43 |             kernel,
44 |             kernel.run_time_ns * math.pow(partial_compute_factor, gamma_compl),
45 |         )
46 | 
47 |     # 2. Compute the three scaling factors
48 |     bandwidth_factor = (
49 |         origin_device.mem_bandwidth_gb / dest_device.mem_bandwidth_gb
50 |     )
51 |     clock_factor = (
52 |         origin_device.base_clock_mhz / dest_device.base_clock_mhz
53 |     )
54 |     sm_factor = (
55 |         origin_device.num_sms / dest_device.num_sms
56 |     )
57 | 
58 |     # 3. 
Scale and return the predicted run time 59 | scaled_run_time_ns = ( 60 | kernel.run_time_ns * 61 | math.pow(bandwidth_factor, gamma) * 62 | math.pow(clock_factor, gamma_compl) * 63 | math.pow(sm_factor, gamma_compl) 64 | ) 65 | return PredictedKernel(kernel, scaled_run_time_ns) 66 | 67 | 68 | def _roofline_gamma(kernel, origin_device, dest_device): 69 | flop_efficiency = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency) 70 | dram_read_bytes = kernel.get_metric(Metric.DRAMReadBytes) 71 | dram_write_bytes = kernel.get_metric(Metric.DRAMWriteBytes) 72 | total_gb = (dram_read_bytes + dram_write_bytes) / 1024 / 1024 / 1024 73 | 74 | gflops_per_second = flop_efficiency / 100 * origin_device.peak_gflops_per_second 75 | num_gflops = gflops_per_second * kernel.run_time_ns / 1e9 76 | 77 | # We only consider the dest ridge point (R). 78 | # We use a decreasing linear function to interpolate between an intensity 79 | # of 0 and R, and use a 1/x function to map intensities greater than R. 80 | # 81 | # gamma = -0.5/R * intensity + 1 if 0 <= intensity <= R 82 | # 0.5R / intensity otherwise 83 | 84 | if num_gflops < 1e-9: 85 | # We treat these cases as fully memory bandwidth bound, even though 86 | # total_gb could also be 0 87 | gamma = 1. 88 | 89 | elif total_gb == 0: 90 | # num_gflops must be non-zero, so this means the kernel is fully 91 | # compute bound 92 | gamma = 0. 93 | 94 | else: 95 | intensity_gflops_per_gb = num_gflops / total_gb 96 | dest_ridge_point = _ridge_point(dest_device) 97 | 98 | if intensity_gflops_per_gb > dest_ridge_point: 99 | gamma = 0.5 * dest_ridge_point / intensity_gflops_per_gb 100 | else: 101 | gamma = -0.5 / dest_ridge_point * intensity_gflops_per_gb + 1. 102 | 103 | assert gamma >= 0 and gamma <= 1 104 | return gamma 105 | 106 | 107 | def _ridge_point(device): 108 | return device.peak_gflops_per_second / device.mem_bandwidth_gb 109 | -------------------------------------------------------------------------------- /analyzer/habitat/analysis/wave_scaling/unified.py: -------------------------------------------------------------------------------- 1 | from habitat.analysis.metrics import Metric 2 | from habitat.analysis.wave_scaling.resimplified import ( 3 | resimplified_wave_scaling, 4 | ) 5 | from habitat.analysis.wave_scaling.roofline import roofline_wave_scaling 6 | 7 | 8 | def unified_wave_scaling( 9 | kernel, 10 | origin_device, 11 | dest_device, 12 | metadata_manager, 13 | ): 14 | try: 15 | # Try reading metrics. These calls will raise exceptions if the metrics 16 | # do not exist. 
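        # (Specifically, MeasuredKernel.get_metric raises AttributeError when
        # a metric was not recorded and no default value is supplied.)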
17 | _ = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency) 18 | _ = kernel.get_metric(Metric.DRAMReadBytes) 19 | _ = kernel.get_metric(Metric.DRAMWriteBytes) 20 | return roofline_wave_scaling( 21 | kernel, 22 | origin_device, 23 | dest_device, 24 | metadata_manager, 25 | ) 26 | except AttributeError: 27 | pass 28 | 29 | # Use resimplified wave scaling when metrics are unavailable 30 | return resimplified_wave_scaling( 31 | kernel, 32 | origin_device, 33 | dest_device, 34 | metadata_manager, 35 | ) 36 | -------------------------------------------------------------------------------- /analyzer/habitat/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | _DATA_PATH = os.path.abspath(os.path.dirname(__file__)) 4 | 5 | 6 | def path_to_data(data_file): 7 | return os.path.join(_DATA_PATH, data_file) 8 | -------------------------------------------------------------------------------- /analyzer/habitat/data/checksums: -------------------------------------------------------------------------------- 1 | d445af3308bc67a087446bed3d7c160fcde458cb bmm/model.pth 2 | b7c4a4ccd2a447a6a9d63e6658461f1f1a9dc08b conv2d/model.pth 3 | 659a00c6cff529613b40d5166fe7d93f42e8327d kernels.sqlite 4 | 930d5a79b35755ae7a8ab72a9b5585423dc02ab7 linear/model.pth 5 | 2da58bcea595de031584da214b29510b9e897466 lstm/model.pth 6 | -------------------------------------------------------------------------------- /analyzer/habitat/data/devices.yml: -------------------------------------------------------------------------------- 1 | # NOTE: All GPU device "names" need to be valid Python identifiers. Therefore 2 | # they cannot start with a numeric character. 3 | 4 | P4000: 5 | compute_major: 6 6 | compute_minor: 1 7 | max_threads_per_block: 1024 8 | max_threads_per_multiprocessor: 2048 9 | regs_per_block: 65536 10 | regs_per_multiprocessor: 65536 11 | warp_size: 32 12 | shared_mem_per_block: 49152 13 | shared_mem_per_multiprocessor: 98304 14 | num_sms: 14 15 | shared_mem_per_block_optin: 49152 16 | mem_bandwidth_gb: 195 17 | base_clock_mhz: 1227 18 | peak_gflops_per_second: 6054 19 | 20 | GTX1080Ti: 21 | compute_major: 6 22 | compute_minor: 1 23 | max_threads_per_block: 1024 24 | max_threads_per_multiprocessor: 2048 25 | regs_per_block: 65536 26 | regs_per_multiprocessor: 65536 27 | warp_size: 32 28 | shared_mem_per_block: 49152 29 | shared_mem_per_multiprocessor: 98304 30 | num_sms: 28 31 | shared_mem_per_block_optin: 49152 32 | mem_bandwidth_gb: 484 33 | base_clock_mhz: 1480 34 | peak_gflops_per_second: 10609 35 | 36 | RTX2070: 37 | compute_major: 7 38 | compute_minor: 5 39 | max_threads_per_block: 1024 40 | max_threads_per_multiprocessor: 1024 41 | regs_per_block: 65536 42 | regs_per_multiprocessor: 65536 43 | warp_size: 32 44 | shared_mem_per_block: 49152 45 | shared_mem_per_multiprocessor: 65536 46 | num_sms: 36 47 | shared_mem_per_block_optin: 0 48 | mem_bandwidth_gb: 383 49 | base_clock_mhz: 1410 50 | peak_gflops_per_second: 4318 51 | 52 | RTX2080Ti: 53 | compute_major: 7 54 | compute_minor: 5 55 | max_threads_per_block: 1024 56 | max_threads_per_multiprocessor: 1024 57 | regs_per_block: 65536 58 | regs_per_multiprocessor: 65536 59 | warp_size: 32 60 | shared_mem_per_block: 49152 61 | shared_mem_per_multiprocessor: 65536 62 | num_sms: 68 63 | shared_mem_per_block_optin: 0 64 | mem_bandwidth_gb: 524 65 | base_clock_mhz: 1350 66 | peak_gflops_per_second: 5938 67 | 68 | P4: 69 | compute_major: 6 70 | compute_minor: 1 71 | max_threads_per_block: 1024 72 | 
max_threads_per_multiprocessor: 2048 73 | regs_per_block: 65536 74 | regs_per_multiprocessor: 65536 75 | warp_size: 32 76 | shared_mem_per_block: 49152 77 | shared_mem_per_multiprocessor: 98304 78 | num_sms: 20 79 | shared_mem_per_block_optin: 0 80 | mem_bandwidth_gb: 192 81 | base_clock_mhz: 810 82 | peak_gflops_per_second: 4147 83 | 84 | T4: 85 | compute_major: 7 86 | compute_minor: 5 87 | max_threads_per_block: 1024 88 | max_threads_per_multiprocessor: 1024 89 | regs_per_block: 65536 90 | regs_per_multiprocessor: 65536 91 | warp_size: 32 92 | shared_mem_per_block: 49152 93 | shared_mem_per_multiprocessor: 65536 94 | num_sms: 40 95 | shared_mem_per_block_optin: 0 96 | mem_bandwidth_gb: 239 97 | base_clock_mhz: 585 98 | peak_gflops_per_second: 1804 99 | 100 | V100: 101 | compute_major: 7 102 | compute_minor: 0 103 | max_threads_per_block: 1024 104 | max_threads_per_multiprocessor: 2048 105 | regs_per_block: 65536 106 | regs_per_multiprocessor: 65536 107 | warp_size: 32 108 | shared_mem_per_block: 49152 109 | shared_mem_per_multiprocessor: 98304 110 | num_sms: 80 111 | shared_mem_per_block_optin: 0 112 | mem_bandwidth_gb: 739 113 | base_clock_mhz: 1312 114 | peak_gflops_per_second: 6716 115 | 116 | P100: 117 | compute_major: 6 118 | compute_minor: 0 119 | max_threads_per_block: 1024 120 | max_threads_per_multiprocessor: 2048 121 | regs_per_block: 65536 122 | regs_per_multiprocessor: 65536 123 | warp_size: 32 124 | shared_mem_per_block: 49152 125 | shared_mem_per_multiprocessor: 65536 126 | num_sms: 56 127 | shared_mem_per_block_optin: 0 128 | mem_bandwidth_gb: 501 129 | base_clock_mhz: 1126 130 | peak_gflops_per_second: 9504 131 | -------------------------------------------------------------------------------- /analyzer/habitat/data/verify.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | CHECKSUM_FILE="checksums" 4 | declare -a FILES=( 5 | "bmm/model.pth" 6 | "conv2d/model.pth" 7 | "kernels.sqlite" 8 | "linear/model.pth" 9 | "lstm/model.pth" 10 | ) 11 | 12 | function generate() { 13 | rm -f $CHECKSUM_FILE 14 | for file in "${FILES[@]}"; do 15 | shasum $file >> $CHECKSUM_FILE 16 | done 17 | echo "Done! Checksum file has been generated." 18 | } 19 | 20 | function validate() { 21 | shasum -c $CHECKSUM_FILE 22 | } 23 | 24 | function usage() { 25 | echo "Usage: $0 [-g | --generate]" 26 | echo "" 27 | echo "This utility checks that Habitat's data files have the correct contents." 28 | echo "" 29 | echo "Use the -g or --generate options to generate the checksum file." 30 | exit 1 31 | } 32 | 33 | if [ -z "$1" ]; then 34 | validate 35 | elif [ "$1" = "-g" ] || [ "$1" = "--generate" ]; then 36 | generate 37 | else 38 | usage "$@" 39 | fi 40 | -------------------------------------------------------------------------------- /analyzer/habitat/profiling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/profiling/__init__.py -------------------------------------------------------------------------------- /analyzer/habitat/profiling/autograd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from habitat.profiling.backward import get_grad_fn, flatten_operation_output 4 | 5 | 6 | class AutogradEngine: 7 | """ 8 | Emulates the backward pass for a given model output, for timing purposes. 
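 
    (It replays the recorded grad_fn graph in topological order, feeding each
    function the stored outputs of its predecessors, so the backward kernels
    execute without computing usable gradients.)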
9 | """ 10 | def __init__(self, grad_fn_ordering, input_map, initial_inputs): 11 | self._grad_fn_ordering = grad_fn_ordering 12 | self._input_holder = { 13 | fn: [None] * size for fn, size in input_map.items() 14 | } 15 | self._input_holder[self._grad_fn_ordering[0]] = initial_inputs 16 | 17 | @classmethod 18 | def new_from(cls, operation_output, exclude_accumulate_grad=True): 19 | # Traverse the autograd graph, build input map for each grad_fn and 20 | # create a topological ordering 21 | _, initial_grad_fn = get_grad_fn(operation_output) 22 | if initial_grad_fn is None: 23 | raise ValueError('No grad_fn available on the operation output.') 24 | 25 | ordering = [] 26 | input_map = {} 27 | initial_inputs = [ 28 | tensor.detach() 29 | for tensor in flatten_operation_output(operation_output) 30 | ] 31 | input_map[initial_grad_fn] = len(initial_inputs) 32 | 33 | stack = [(initial_grad_fn, 0)] 34 | visited = {initial_grad_fn} 35 | 36 | # Build a topological ordering 37 | while len(stack) > 0: 38 | grad_fn, visit_count = stack.pop() 39 | if visit_count != 0: 40 | ordering.append(grad_fn) 41 | continue 42 | 43 | stack.append((grad_fn, 1)) 44 | for next_fn, input_idx in grad_fn.next_functions: 45 | if next_fn is None: 46 | continue 47 | 48 | if (exclude_accumulate_grad and 49 | next_fn.name() == 'torch::autograd::AccumulateGrad'): 50 | continue 51 | 52 | # Keep track of the inputs to each grad_fn 53 | if next_fn not in input_map: 54 | input_map[next_fn] = 1 55 | input_map[next_fn] = max(input_map[next_fn], input_idx + 1) 56 | 57 | # Determine whether to visit this grad_fn 58 | if next_fn in visited: 59 | continue 60 | 61 | visited.add(next_fn) 62 | stack.append((next_fn, 0)) 63 | 64 | ordering.reverse() 65 | return cls(ordering, input_map, initial_inputs) 66 | 67 | def run_backward(self): 68 | for grad_fn in self._grad_fn_ordering: 69 | # 1. Run the backward function 70 | outputs = grad_fn(*(self._input_holder[grad_fn])) 71 | 72 | # 2. Store its outputs for the next backward function(s) 73 | if isinstance(outputs, torch.Tensor): 74 | outputs = [outputs] 75 | for (output, (next_fn, input_idx)) in zip( 76 | outputs, grad_fn.next_functions): 77 | if next_fn is None or next_fn not in self._input_holder: 78 | continue 79 | # NOTE: If implementing to actually calculate the gradient, we 80 | # need to sum gradients that "flow" into the same grad function 81 | # input. 
82 | self._input_holder[next_fn][input_idx] = output 83 | -------------------------------------------------------------------------------- /analyzer/habitat/profiling/backward.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BackwardHelper: 5 | def __init__(self, backward_runnable, ag_dict): 6 | self.run_backward = backward_runnable 7 | self._ag_dict = ag_dict 8 | 9 | @classmethod 10 | def new_from(cls, operation_outputs): 11 | retval, initial_grad_fn = get_grad_fn(operation_outputs) 12 | if initial_grad_fn is None: 13 | raise ValueError('No grad_fn available on the operation output.') 14 | 15 | grads = torch.ones_like(retval) 16 | def backward_runnable(): 17 | torch.autograd.backward(retval, grads, retain_graph=True) 18 | 19 | size_dict = get_accumulate_grad_inputs( 20 | initial_grad_fn, 21 | backward_runnable, 22 | ) 23 | 24 | ag_dict = { 25 | grad_fn: torch.randn(size, device=torch.device('cuda')) 26 | for grad_fn, size in size_dict.items() 27 | } 28 | 29 | return cls(backward_runnable, ag_dict) 30 | 31 | def run_accumulate_grad(self): 32 | for grad_fn, grad in self._ag_dict.items(): 33 | grad_fn(grad) 34 | 35 | 36 | def backward_available(operation_output): 37 | return get_grad_fn(operation_output)[1] is not None 38 | 39 | 40 | def flatten_operation_output(operation_output): 41 | if isinstance(operation_output, torch.Tensor): 42 | return [operation_output] 43 | elif (not isinstance(operation_output, tuple) and 44 | not isinstance(operation_output, list)): 45 | return [] 46 | 47 | flattened = [] 48 | for value in operation_output: 49 | flattened.extend(flatten_operation_output(value)) 50 | return flattened 51 | 52 | 53 | def get_grad_fn(retval): 54 | if isinstance(retval, torch.Tensor) and retval.grad_fn is not None: 55 | return retval, retval.grad_fn 56 | elif isinstance(retval, tuple) or isinstance(retval, list): 57 | for inner_value in retval: 58 | inner_retval, grad_fn = get_grad_fn(inner_value) 59 | if grad_fn is not None: 60 | return inner_retval, grad_fn 61 | 62 | return None, None 63 | 64 | 65 | def get_accumulate_grad_inputs(initial_grad_fn, backward_runnable): 66 | input_dict = {} 67 | hook_handles = [] 68 | def get_hook(grad_fn): 69 | def hook(arg1, arg2): 70 | if not isinstance(arg2[0], torch.Tensor): 71 | return 72 | input_dict[grad_fn] = arg2[0].size() 73 | return hook 74 | 75 | # Traverse the graph to identify all AccumulateGrad functions 76 | stack = [initial_grad_fn] 77 | visited = {initial_grad_fn} 78 | 79 | while len(stack) > 0: 80 | grad_fn = stack.pop() 81 | 82 | if grad_fn.name() == 'torch::autograd::AccumulateGrad': 83 | hook_handles.append(grad_fn.register_hook(get_hook(grad_fn))) 84 | 85 | for next_grad_fn, _ in grad_fn.next_functions: 86 | if next_grad_fn is None or next_grad_fn in visited: 87 | continue 88 | stack.append(next_grad_fn) 89 | visited.add(next_grad_fn) 90 | 91 | # Run a backward pass to get accumulate grad sizes 92 | backward_runnable() 93 | torch.cuda.synchronize() 94 | 95 | # Clear hooks 96 | for handle in hook_handles: 97 | handle.remove() 98 | 99 | return input_dict 100 | -------------------------------------------------------------------------------- /analyzer/habitat/profiling/kernel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import habitat.habitat_cuda as hc 3 | 4 | from habitat.analysis import SPECIAL_OPERATIONS 5 | from habitat.analysis.metrics import resolve_metrics 6 | from habitat.analysis.kernels 
import MeasuredKernel 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class KernelProfiler: 12 | def __init__(self, device, metrics=None, metrics_threshold_ms=0): 13 | self._device = device 14 | self._metrics = resolve_metrics(metrics, self._device) 15 | self._metrics_threshold_ns = metrics_threshold_ms * 1000000 16 | 17 | def measure_kernels(self, runnable, func_name=None): 18 | """ 19 | Uses CUPTI to measure the kernels launched by runnable. 20 | 21 | Returns: 22 | A list of MeasuredKernels 23 | """ 24 | if func_name is None: 25 | fname = ( 26 | runnable.__name__ if hasattr(runnable, "__name__") 27 | else "Unnamed" 28 | ) 29 | else: 30 | fname = func_name 31 | 32 | return list(map( 33 | lambda ks: MeasuredKernel(ks[0], ks[1], self._device), 34 | self._measure_kernels_raw(runnable, fname) 35 | )) 36 | 37 | def _measure_kernels_raw(self, runnable, func_name): 38 | """ 39 | Uses CUPTI to measure the kernels launched by runnable. 40 | 41 | Returns: 42 | A list of tuples, where 43 | - tuple[0] is the raw kernel measurement that should be used for 44 | the kernel's run time 45 | - tuple[1] is a list of the raw kernel measurements that contain 46 | the metrics requested 47 | """ 48 | time_kernels = hc.profile(runnable) 49 | if (len(self._metrics) == 0 or 50 | func_name in SKIP_METRICS or 51 | func_name in SPECIAL_OPERATIONS or 52 | self._under_threshold(time_kernels)): 53 | return list(map(lambda tk: (tk, []), time_kernels)) 54 | 55 | try: 56 | metric_kernels = [ 57 | hc.profile(runnable, metric) for metric in self._metrics 58 | ] 59 | # Make sure the same number of kernels are recorded for each metric 60 | assert all(map( 61 | lambda ks: len(ks) == len(metric_kernels[0]), 62 | metric_kernels, 63 | )) 64 | # metric_kernels is originally (# metrics x # kernels in op) 65 | # we need to transpose it to become (# kernels in op x # metrics) 66 | # so that we can join kernels with their metrics. 
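            # e.g., [[k1_m1, k2_m1], [k1_m2, k2_m2]] becomes
            #       [[k1_m1, k1_m2], [k2_m1, k2_m2]]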
67 |             transposed = map(list, zip(*metric_kernels))
68 |             # We return a list of (time kernel, [metric kernels])
69 |             return list(zip(time_kernels, transposed))
70 |         except RuntimeError as ex:
71 |             logger.warning(
72 |                 'Metrics error "%s" for function "%s".',
73 |                 str(ex),
74 |                 func_name,
75 |             )
76 |             return list(map(lambda tk: (tk, []), time_kernels))
77 | 
78 |     def _under_threshold(self, kernels):
79 |         # If under threshold, don't measure metrics
80 |         return (
81 |             sum(map(lambda k: k.run_time_ns, kernels))
82 |             <= self._metrics_threshold_ns
83 |         )
84 | 
85 | 
86 | SKIP_METRICS = {
87 |     "detach_",
88 | }
89 | 
-------------------------------------------------------------------------------- /analyzer/habitat/profiling/run_time.py: --------------------------------------------------------------------------------
1 | import torch
2 | 
3 | 
4 | class RunTimeProfiler:
5 |     def __init__(self, warm_up=3, measure_for=3):
6 |         self._warm_up = warm_up
7 |         self._measure_for = measure_for
8 |         self._start_event = torch.cuda.Event(enable_timing=True)
9 |         self._end_event = torch.cuda.Event(enable_timing=True)
10 | 
11 |     def measure_ms(self, runnable):
12 |         for _ in range(self._warm_up):
13 |             runnable()
14 | 
15 |         self._start_event.record()
16 |         for _ in range(self._measure_for):
17 |             runnable()
18 |         self._end_event.record()
19 |         torch.cuda.synchronize()
20 | 
21 |         return (
22 |             self._start_event.elapsed_time(self._end_event) / self._measure_for
23 |         )
24 | 
-------------------------------------------------------------------------------- /analyzer/habitat/tracking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/analyzer/habitat/tracking/__init__.py -------------------------------------------------------------------------------- /analyzer/habitat/tracking/base.py: --------------------------------------------------------------------------------
1 | import contextlib
2 | 
3 | 
4 | class TrackerBase:
5 |     def __init__(self):
6 |         self._is_tracking = False
7 | 
8 |     @contextlib.contextmanager
9 |     def track(self):
10 |         self.start_tracking()
11 |         try:
12 |             yield self
13 |         finally:
14 |             self.stop_tracking()
15 | 
16 |     def start_tracking(self):
17 |         self._is_tracking = True
18 | 
19 |     def stop_tracking(self):
20 |         self._is_tracking = False
21 | 
-------------------------------------------------------------------------------- /analyzer/habitat/tracking/callable.py: --------------------------------------------------------------------------------
1 | import inspect
2 | 
3 | import torch
4 | 
5 | from habitat.tracking.base import TrackerBase
6 | from habitat.tracking.hook_manager import HookManager
7 | 
8 | 
9 | class CallableTracker(TrackerBase):
10 |     def __init__(self, hook_creator):
11 |         super().__init__()
12 |         self._hook_manager = HookManager()
13 |         self._hook_creator = hook_creator
14 | 
15 |     def start_tracking(self):
16 |         super().start_tracking()
17 |         self._hook_manager.attach_hooks_on_module(
18 |             torch,
19 |             lambda fn: _is_callable_and_public(fn) and \
20 |                 fn.__name__ not in BLACKLISTED_TORCH_METHODS,
21 |             self._hook_creator,
22 |         )
23 |         self._hook_manager.attach_hooks_on_module(
24 |             torch.Tensor,
25 |             lambda fn: _is_callable_and_public(fn) and \
26 |                 fn.__name__ != 'backward' and \
27 |                 fn.__name__ not in BLACKLISTED_TENSOR_METHODS,
28 |             self._hook_creator,
29 |         )
30 |         self._hook_manager.attach_hooks_on_module(
31 |             torch.Tensor,
32 |             _is_callable_dunder,
33 |             self._hook_creator,
34 |         )
35 | 
self._hook_manager.attach_hooks_on_module( 36 | torch.nn.functional, 37 | _is_callable_and_public, 38 | self._hook_creator, 39 | ) 40 | self._hook_manager.attach_hooks_on_module_using( 41 | torch.nn._VF, 42 | torch._C._VariableFunctions, 43 | _is_callable_and_public, 44 | self._hook_creator, 45 | ) 46 | 47 | def stop_tracking(self): 48 | super().stop_tracking() 49 | self._hook_manager.remove_hooks() 50 | 51 | 52 | def _is_callable_and_public(maybe_fn): 53 | # By convention, _ prefixed functions in Python should not be 54 | # called by users (i.e. they are "private" functions) 55 | return _is_callable(maybe_fn) and maybe_fn.__name__[0] != '_' 56 | 57 | # Original source of these blacklists: 58 | # https://github.com/NVIDIA/apex/blob/master/apex/pyprof/nvtx/nvmarker.py 59 | BLACKLISTED_DUNDERS = { 60 | '__all__', 61 | '__array__', 62 | '__array_priority__', 63 | '__array_wrap__', 64 | '__bool__', 65 | '__builtins__', 66 | '__cached__', 67 | '__class__', 68 | '__deepcopy__', 69 | '__delattr__', 70 | '__delitem__', 71 | '__dict__', 72 | '__dir__', 73 | '__doc__', 74 | '__file__', 75 | '__format__', 76 | '__getattribute__', 77 | '__getitem__', 78 | '__hash__', 79 | '__index__', 80 | '__init__', 81 | '__init_subclass__', 82 | '__iter__', 83 | '__len__', 84 | '__loader__', 85 | '__module__', 86 | '__name__', 87 | '__new__', 88 | '__nonzero__', 89 | '__package__', 90 | '__path__', 91 | '__reduce__', 92 | '__reduce_ex__', 93 | '__repr__', 94 | '__reversed__', 95 | '__setattr__', 96 | '__setitem__', 97 | '__setstate__', 98 | '__sizeof__', 99 | '__spec__', 100 | '__str__', 101 | '__subclasshook__', 102 | '__version__', 103 | '__weakref__', 104 | } 105 | 106 | BLACKLISTED_TENSOR_METHODS = { 107 | 'size', 'dim', 'item', 'tolist', 108 | } 109 | 110 | BLACKLISTED_TORCH_METHODS = { 111 | 'is_storage', 112 | } 113 | 114 | 115 | def _is_callable_dunder(maybe_fn): 116 | """ 117 | Returns True if maybe_fn is a callable dunder (callable named with double 118 | underscores) (e.g., __add__) 119 | """ 120 | return ( 121 | _is_callable(maybe_fn) and 122 | len(maybe_fn.__name__) > 4 and 123 | maybe_fn.__name__[:2] == '__' and 124 | maybe_fn.__name__[-2:] == '__' and 125 | maybe_fn.__name__ not in BLACKLISTED_DUNDERS 126 | ) 127 | 128 | 129 | def _is_callable(maybe_fn): 130 | return ( 131 | inspect.isfunction(maybe_fn) or 132 | inspect.ismethod(maybe_fn) or 133 | inspect.isbuiltin(maybe_fn) or 134 | inspect.isroutine(maybe_fn) 135 | ) 136 | -------------------------------------------------------------------------------- /analyzer/habitat/tracking/hook_manager.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class HookManager: 4 | def __init__(self): 5 | self._original_callables = {} 6 | 7 | def attach_hooks_on_module(self, module, predicate, hook_creator): 8 | self.attach_hooks_on_module_using( 9 | module, module, predicate, hook_creator) 10 | 11 | def attach_hooks_on_module_using( 12 | self, module, using_module, predicate, hook_creator): 13 | """ 14 | Attach hooks onto functions in the provided module. Use the 15 | `using_module` to discover the existing functions. 
16 | """ 17 | for prop in dir(using_module): 18 | if not predicate(getattr(module, prop)): 19 | continue 20 | self.attach_hook(module, prop, hook_creator) 21 | 22 | def attach_hook(self, module, prop, hook_creator): 23 | target = getattr(module, prop) 24 | self._maybe_store_callable(module, prop, target) 25 | setattr(module, prop, hook_creator(target)) 26 | 27 | def remove_hooks(self): 28 | for module, callable_pairs in self._original_callables.items(): 29 | for prop, original_callable in callable_pairs.items(): 30 | setattr(module, prop, original_callable) 31 | self._original_callables.clear() 32 | 33 | def _maybe_store_callable(self, module, prop, original_callable): 34 | """ 35 | Store the original callable (to be able to restore it) only when it is 36 | the first time we are encountering the given callable. 37 | """ 38 | if module not in self._original_callables: 39 | self._original_callables[module] = {} 40 | 41 | if prop in self._original_callables[module]: 42 | return 43 | 44 | self._original_callables[module][prop] = original_callable 45 | -------------------------------------------------------------------------------- /analyzer/habitat/tracking/operation.py: -------------------------------------------------------------------------------- 1 | from habitat.analysis import SPECIAL_OPERATIONS 2 | from habitat.analysis.arguments import Arguments 3 | from habitat.analysis.operation import MeasuredOperation 4 | from habitat.analysis.trace import Trace 5 | from habitat.profiling.operation import OperationProfiler 6 | from habitat.tracking.base import TrackerBase 7 | from habitat.tracking.callable import CallableTracker 8 | 9 | 10 | class OperationTracker(TrackerBase): 11 | def __init__(self, device, metrics=None, metrics_threshold_ms=0): 12 | super().__init__() 13 | self._device = device 14 | self._callable_tracker = CallableTracker(self._hook_creator) 15 | self._profiler = OperationProfiler( 16 | device, 17 | metrics, 18 | metrics_threshold_ms, 19 | ) 20 | self._processing_hook = False 21 | 22 | self._operations = [] 23 | 24 | def start_tracking(self): 25 | super().start_tracking() 26 | self._callable_tracker.start_tracking() 27 | 28 | def stop_tracking(self): 29 | super().stop_tracking() 30 | self._callable_tracker.stop_tracking() 31 | 32 | def get_tracked_trace(self): 33 | return Trace(self._device, self._operations) 34 | 35 | def _hook_creator(self, func): 36 | def hook(*args, **kwargs): 37 | # NOTE: We use self._processing_hook to handle cases where we have 38 | # hooks on nested function calls. 39 | if self._processing_hook: 40 | return func(*args, **kwargs) 41 | 42 | self._processing_hook = True 43 | try: 44 | # We only track the arguments if the operation is "special" 45 | # (i.e. we use special handling to scale it to a different 46 | # device). 
47 | is_special_op = func.__name__ in SPECIAL_OPERATIONS 48 | arguments = ( 49 | Arguments.from_raw_arguments(args, kwargs) 50 | if is_special_op else None 51 | ) 52 | 53 | if (func.__name__ == 'lstm' and 54 | isinstance(arguments.args[4], bool)): 55 | # Special case - we need this information for the lstm 56 | # operation 57 | arguments.special['batch_sizes'] = args[1].tolist() 58 | 59 | forward, backward = self._profiler.measure_operation( 60 | func, 61 | args, 62 | kwargs, 63 | ) 64 | self._operations.append(MeasuredOperation( 65 | name=func.__name__, 66 | arguments=arguments, 67 | forward=forward, 68 | backward=backward, 69 | device=self._device, 70 | )) 71 | 72 | # Actually run the hooked function 73 | return func(*args, **kwargs) 74 | finally: 75 | self._processing_hook = False 76 | 77 | return hook 78 | -------------------------------------------------------------------------------- /analyzer/habitat/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import yaml 4 | 5 | 6 | def set_up_logging(): 7 | logging.basicConfig( 8 | level=logging.INFO, 9 | format='%(asctime)s %(levelname)-8s %(message)s', 10 | datefmt='%Y-%m-%d %H:%M', 11 | ) 12 | 13 | 14 | def add_common_cmd_args(parser): 15 | parser.add_argument('model_path', help='The serialized model to analyze') 16 | parser.add_argument( 17 | 'model_config_path', 18 | help='The configuration file for the model', 19 | ) 20 | parser.add_argument( 21 | '--device-config', 22 | type=str, 23 | default='devices.yml', 24 | help='The config file containing GPU device properties.', 25 | ) 26 | parser.add_argument( 27 | '--origin-device', 28 | type=str, 29 | required=True, 30 | help='The GPU on which the analysis is being performed.', 31 | ) 32 | parser.add_argument( 33 | '--kernel-lut', 34 | type=str, 35 | default=os.path.join('lutfiles', 'kernels.sqlite'), 36 | help='The path to the kernel metadata look up table.', 37 | ) 38 | parser.add_argument( 39 | '--operation-lut', 40 | type=str, 41 | default=os.path.join('lutfiles', 'operations.sqlite'), 42 | help='The path to the operation run time look up table.', 43 | ) 44 | 45 | 46 | def ns_to_ms(ns): 47 | return ns / 1e6 48 | 49 | 50 | def ms_to_ns(ms): 51 | return ms * 1e6 52 | 53 | 54 | def name_all_arguments(all_parameters, args, kwargs): 55 | """ 56 | This function merges positional and keyword arguments 57 | into one dictionary based on the declared names of the 58 | function's parameters. 59 | """ 60 | merged = {**kwargs} 61 | for arg_name, arg in zip(all_parameters, args): 62 | merged[arg_name] = arg 63 | return merged 64 | -------------------------------------------------------------------------------- /analyzer/install-dev.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | SO_NAME="habitat_cuda.cpython-36m-x86_64-linux-gnu.so" 4 | PACKAGE_NAME="habitat-predict" 5 | 6 | # Operate out of the script directory 7 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P) 8 | cd $SCRIPT_PATH 9 | 10 | # Abort if an error occurs 11 | set -e 12 | 13 | function pushd() { 14 | command pushd "$@" > /dev/null 15 | } 16 | 17 | function popd() { 18 | command popd "$@" > /dev/null 19 | } 20 | 21 | function compile_habitat_cuda() { 22 | echo "Compiling the Habitat C++ extension..." 23 | pushd ../cpp 24 | mkdir -p build 25 | pushd build 26 | 27 | cmake -DCMAKE_BUILD_TYPE=Release .. 28 | make -j 8 habitat_cuda 29 | 30 | if [ ! 
-f $SO_NAME ]; then
31 |     echo "ERROR: Could not find $SO_NAME after compilation. Please double "
32 |     echo "check that compilation completed successfully."
33 |     exit 1
34 |   fi
35 |
36 |   popd
37 |   popd
38 |   echo ""
39 | }
40 |
41 | function symlink_habitat_cuda() {
42 |   echo "Adding a symbolic link to the Habitat C++ extension..."
43 |   if [ ! -h habitat/$SO_NAME ]; then
44 |     ln -s ../../cpp/build/$SO_NAME habitat
45 |   fi
46 |   echo ""
47 | }
48 |
49 | function install_habitat() {
50 |   echo "Installing an editable version of the Habitat package..."
51 |   pip3 install --editable .
52 |   echo ""
53 | }
54 |
55 | function uninstall_habitat() {
56 |   pip3 uninstall $PACKAGE_NAME
57 | }
58 |
59 | function check_prereqs() {
60 |   if [ -z $(which cmake) ]; then
61 |     echo "Please ensure cmake 3.17+ is installed."
62 |     exit 1
63 |   fi
64 |   if [ -z $(which make) ]; then
65 |     echo "Please ensure make is installed."
66 |     exit 1
67 |   fi
68 |   if [ -z $(which pip3) ]; then
69 |     echo "Please ensure pip3 is installed."
70 |     exit 1
71 |   fi
72 | }
73 |
74 | function main() {
75 |   if [ "$1" = "--uninstall" ]; then
76 |     uninstall_habitat
77 |   else
78 |     check_prereqs
79 |     compile_habitat_cuda
80 |     symlink_habitat_cuda
81 |     install_habitat
82 |   fi
83 | }
84 |
85 | main "$@"
86 |
-------------------------------------------------------------------------------- /analyzer/pyproject.toml: --------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=40.6.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
-------------------------------------------------------------------------------- /analyzer/setup.py: --------------------------------------------------------------------------------
1 | import codecs
2 | import os
3 | import re
4 | import sys
5 |
6 | from setuptools import setup, find_packages
7 |
8 | # Acknowledgement: This setup.py was adapted from Hynek Schlawack's Python
9 | # Packaging Guide
10 | # https://hynek.me/articles/sharing-your-labor-of-love-pypi-quick-and-dirty
11 |
12 | ###################################################################
13 |
14 | NAME = "habitat-predict"
15 | PACKAGES = find_packages()
16 | META_PATH = os.path.join("habitat", "__init__.py")
17 | README_PATH = "README.md"
18 | PYTHON_REQUIRES = ">=3.6"
19 |
20 | PACKAGE_DATA = {
21 |     "habitat": [
22 |         "data/hints.yml",
23 |         "data/bmm/model.pth",
24 |         "data/conv2d/model.pth",
25 |         "data/kernels.sqlite",
26 |         "data/linear/model.pth",
27 |         "data/lstm/model.pth",
28 |         "habitat_cuda.cpython-36m-x86_64-linux-gnu.so",
29 |     ],
30 | }
31 |
32 | INSTALL_REQUIRES = [
33 |     # "pyyaml",
34 |     # "torch>=1.4.0",
35 |     "pandas>=1.1.2",
36 |     "tqdm>=4.49.0"
37 | ]
38 |
39 | KEYWORDS = [
40 |     "neural networks",
41 |     "pytorch",
42 |     "performance",
43 |     "profiler",
44 |     "predictions",
45 | ]
46 |
47 | CLASSIFIERS = [
48 |     "Do Not Upload",
49 |     "Development Status :: 3 - Alpha",
50 |     "Intended Audience :: Developers",
51 |     "License :: OSI Approved :: Apache Software License",
52 |     "Programming Language :: Python :: 3 :: Only",
53 | ]
54 |
55 | ###################################################################
56 |
57 | HERE = os.path.abspath(os.path.dirname(__file__))
58 |
59 |
60 | def read(*parts):
61 |     """
62 |     Build an absolute path from *parts* and return the contents of the
63 |     resulting file. Assume UTF-8 encoding.
64 | """ 65 | with codecs.open(os.path.join(HERE, *parts), "rb", "utf-8") as f: 66 | return f.read() 67 | 68 | 69 | META_FILE = read(META_PATH) 70 | 71 | 72 | def find_meta(meta): 73 | """ 74 | Extract __*meta*__ from META_FILE. 75 | """ 76 | meta_match = re.search( 77 | r"^__{meta}__ = ['\"]([^'\"]*)['\"]".format(meta=meta), 78 | META_FILE, re.M 79 | ) 80 | if meta_match: 81 | return meta_match.group(1) 82 | raise RuntimeError("Unable to find __{meta}__ string.".format(meta=meta)) 83 | 84 | 85 | if __name__ == "__main__": 86 | setup( 87 | name=NAME, 88 | description=find_meta("description"), 89 | license=find_meta("license"), 90 | version=find_meta("version"), 91 | author=find_meta("author"), 92 | author_email=find_meta("email"), 93 | maintainer=find_meta("author"), 94 | maintainer_email=find_meta("email"), 95 | long_description=read(README_PATH), 96 | long_description_content_type="text/markdown", 97 | packages=PACKAGES, 98 | package_data=PACKAGE_DATA, 99 | python_requires=PYTHON_REQUIRES, 100 | install_requires=INSTALL_REQUIRES, 101 | classifiers=CLASSIFIERS, 102 | keywords=KEYWORDS, 103 | ) 104 | -------------------------------------------------------------------------------- /cpp/.gitignore: -------------------------------------------------------------------------------- 1 | # CMake build directories 2 | build 3 | debug 4 | cmake-build-debug 5 | 6 | # CLion project directory 7 | .idea 8 | -------------------------------------------------------------------------------- /cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.17 FATAL_ERROR) 2 | project(habitat LANGUAGES C CXX CUDA) 3 | 4 | # Include our custom find module files 5 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") 6 | 7 | # Find all dependencies 8 | find_package(CUDAToolkit REQUIRED) 9 | # We need CUPTI here because CUDAToolkit does not include the nvperf-related 10 | # libraries and headers that we need. 11 | find_package(CUPTI REQUIRED) 12 | find_package(NVPerf REQUIRED) 13 | add_subdirectory(external) 14 | 15 | ################################################################################ 16 | # Habitat Targets 17 | ################################################################################ 18 | 19 | # habitat_cuda: Builds a Python importable module that provides bindings to the 20 | # CUDA-related functionality used by Habitat. 21 | set(HabitatCUDA "habitat_cuda") 22 | pybind11_add_module(${HabitatCUDA} src/habitat_cuda.cpp) 23 | set_property(TARGET ${HabitatCUDA} PROPERTY CXX_STANDARD 11) 24 | 25 | # device_info: Builds a utility executable that prints information about the 26 | # underlying GPU device (e.g., number of SMs, memory bandwidth). 27 | set(DeviceInfo "device_info") 28 | add_executable(${DeviceInfo} src/device_info.cpp) 29 | set_property(TARGET ${DeviceInfo} PROPERTY CXX_STANDARD 11) 30 | 31 | ################################################################################ 32 | 33 | # Add our source files 34 | cmake_policy(SET CMP0076 NEW) 35 | add_subdirectory(src) 36 | 37 | # Specify dependencies for each target 38 | target_link_libraries(${HabitatCUDA} PRIVATE habitat-cuda-lib) 39 | set(HabitatCUDATransitiveDeps habitat-cuda-lib cupti_profilerhost_util) 40 | target_link_libraries(${DeviceInfo} PRIVATE CUDA::cudart gflags::gflags) 41 | 42 | # Since pybind11 modules are shared libraries, all the static libraries it 43 | # depends on must be compiled as position independent code. 
44 | foreach(LIB ${HabitatCUDATransitiveDeps})
45 |   set_property(TARGET ${LIB} PROPERTY POSITION_INDEPENDENT_CODE ON)
46 | endforeach()
47 |
48 | # Turn on all compile warnings
49 | set(AllTargets ${HabitatCUDA})
50 | foreach(TGT ${AllTargets})
51 |   if(CMAKE_COMPILER_IS_GNUCC)
52 |     target_compile_options(${TGT} PRIVATE "-Wall")
53 |   endif()
54 | endforeach()
55 |
-------------------------------------------------------------------------------- /cpp/README.md: --------------------------------------------------------------------------------
1 | Habitat C++ Sources
2 | ===================
3 | This directory contains the C++ source code for Habitat. All the C++ code is
4 | kept in this unified directory to simplify code sharing. CMake is used to
5 | compile the code.
6 |
7 | The C++ code is currently used to build two targets:
8 |
9 | - `habitat_cuda`: A Python-importable module that provides bindings to the
10 |   CUDA-related functionality used by Habitat.
11 | - `device_info`: A utility that prints information about the underlying GPU
12 |   (e.g., number of SMs, memory bandwidth, etc.).
13 |
14 | Code Organization
15 | -----------------
16 | Each target corresponds to one file under the `src` directory. The rest of the
17 | supporting code is organized in subdirectories under `src`.
18 |
19 | If code is shared among multiple targets, it will likely be organized into an
20 | independently compiled library that can be linked to the needed targets.
21 |
-------------------------------------------------------------------------------- /cpp/cmake/FindCUPTI.cmake: --------------------------------------------------------------------------------
1 | #[=======================================================================[.rst:
2 | FindCUPTI
3 | ---------
4 |
5 | Finds the CUPTI library.
6 |
7 | To specify a custom location, set the ``CUPTI_DIR`` environment variable.
8 | This find module file will look under ``$CUPTI_DIR/include`` and ``$CUPTI_DIR/lib/x64``
9 | for the headers and CUPTI shared library respectively.
10 |
11 | Result Variables
12 | ^^^^^^^^^^^^^^^^
13 |
14 | This will define the following variables:
15 |
16 | ``CUPTI_FOUND``
17 |   True if the system has CUPTI.
18 | ``CUPTI_INCLUDE_DIRS``
19 |   Include directories needed to use CUPTI.
20 | ``CUPTI_LIBRARIES``
21 |   Libraries needed to link to CUPTI.
22 |
23 | Cache Variables
24 | ^^^^^^^^^^^^^^^
25 |
26 | The following cache variables may also be set:
27 |
28 | ``CUPTI_INCLUDE_DIR``
29 |   The directory containing ``cupti.h``.
30 | ``CUPTI_LIBRARY``
31 |   The path to the CUPTI library.
32 | 33 | #]=======================================================================] 34 | 35 | include(FindPackageHandleStandardArgs) 36 | 37 | if(DEFINED ENV{CUDA_HOME}) 38 | SET(CUPTI_CUDA_HOME "$ENV{CUDA_HOME}") 39 | endif() 40 | 41 | if(DEFINED ENV{CUPTI_DIR}) 42 | SET(CUPTI_DIR "$ENV{CUPTI_DIR}") 43 | endif() 44 | 45 | find_path(CUPTI_INCLUDE_DIR cupti.h 46 | HINTS 47 | ${CUPTI_DIR}/include 48 | ${CUPTI_CUDA_HOME}/extras/CUPTI/include 49 | /usr/local/cuda/extras/CUPTI/include 50 | ) 51 | 52 | find_library(CUPTI_LIBRARY cupti 53 | HINTS 54 | ${CUPTI_DIR}/lib/x64 55 | ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64 56 | /usr/local/cuda/extras/CUPTI/lib64 57 | ) 58 | 59 | find_package_handle_standard_args(CUPTI 60 | DEFAULT_MSG 61 | CUPTI_INCLUDE_DIR 62 | CUPTI_LIBRARY 63 | ) 64 | 65 | if(CUPTI_FOUND) 66 | set(CUPTI_INCLUDE_DIRS ${CUPTI_INCLUDE_DIR}) 67 | set(CUPTI_LIBRARIES ${CUPTI_LIBRARY}) 68 | 69 | message(STATUS "Found CUPTI includes: ${CUPTI_INCLUDE_DIRS}") 70 | message(STATUS "Found CUPTI library: ${CUPTI_LIBRARIES}") 71 | endif() 72 | -------------------------------------------------------------------------------- /cpp/cmake/FindNVPerf.cmake: -------------------------------------------------------------------------------- 1 | #[=======================================================================[.rst: 2 | FindNVPerf 3 | --------- 4 | 5 | Finds the NVPerf library. 6 | 7 | Note that NVPerf is usually distributed with CUPTI. Therefore to specify a 8 | custom location, set the ``CUPTI_DIR`` environment variable. This find module 9 | file will look under ``$CUPTI_DIR/lib/x64`` for the NVPerf shared library. 10 | 11 | Result Variables 12 | ^^^^^^^^^^^^^^^^ 13 | 14 | This will define the following variables: 15 | 16 | ``NVPerf_FOUND`` 17 | True if the system has NVPerf. 18 | ``NVPerf_LIBRARIES`` 19 | Libraries needed to link to NVPerf. 20 | 21 | Cache Variables 22 | ^^^^^^^^^^^^^^^ 23 | 24 | The following cache variables may also be set: 25 | 26 | ``NVPerf_HOST_LIBRARY`` 27 | The path to the NVPerf host library. 28 | ``NVPerf_TARGET_LIBRARY`` 29 | The path to the NVPerf target library. 
30 | 31 | #]=======================================================================] 32 | 33 | include(FindPackageHandleStandardArgs) 34 | 35 | if(DEFINED ENV{CUDA_HOME}) 36 | SET(CUPTI_CUDA_HOME "$ENV{CUDA_HOME}") 37 | endif() 38 | 39 | if(DEFINED ENV{CUPTI_DIR}) 40 | SET(CUPTI_DIR "$ENV{CUPTI_DIR}") 41 | endif() 42 | 43 | find_library(NVPerf_HOST_LIBRARY nvperf_host 44 | HINTS 45 | ${CUPTI_DIR}/lib/x64 46 | ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64 47 | /usr/local/cuda/extras/CUPTI/lib64 48 | ) 49 | 50 | find_library(NVPerf_TARGET_LIBRARY nvperf_target 51 | HINTS 52 | ${CUPTI_DIR}/lib/x64 53 | ${CUPTI_CUDA_HOME}/extras/CUPTI/lib64 54 | /usr/local/cuda/extras/CUPTI/lib64 55 | ) 56 | 57 | find_package_handle_standard_args(NVPerf 58 | DEFAULT_MSG 59 | NVPerf_HOST_LIBRARY 60 | NVPerf_TARGET_LIBRARY 61 | ) 62 | 63 | if(NVPerf_FOUND) 64 | set(NVPerf_LIBRARIES ${NVPerf_HOST_LIBRARY} ${NVPerf_TARGET_LIBRARY}) 65 | message(STATUS "Found NVPerf libraries: ${NVPerf_LIBRARIES}") 66 | endif() 67 | -------------------------------------------------------------------------------- /cpp/external/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(cupti_profilerhost_util) 2 | add_subdirectory(gflags) 3 | add_subdirectory(pybind11) 4 | -------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(CuptiProfilerHost "cupti_profilerhost_util") 2 | 3 | add_library(${CuptiProfilerHost} 4 | src/profilerhost_util/Eval.cpp 5 | src/profilerhost_util/List.cpp 6 | src/profilerhost_util/Metric.cpp 7 | ) 8 | 9 | target_link_libraries(${CuptiProfilerHost} 10 | PRIVATE 11 | CUDA::cupti 12 | ${NVPerf_LIBRARIES} 13 | ) 14 | 15 | target_include_directories(${CuptiProfilerHost} 16 | PUBLIC 17 | ${CMAKE_CURRENT_SOURCE_DIR}/include/c_util 18 | ${CMAKE_CURRENT_SOURCE_DIR}/include/profilerhost_util 19 | PRIVATE 20 | # We need to add this in addition to the CUDA::cupti 21 | # target above because this variable includes the 22 | # nvperf headers (and the CUDA::cupti target does not). 
${CUPTI_INCLUDE_DIRS}
24 | )
25 |
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/c_util/FileOp.h: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <iostream>
3 | #include <vector>
4 |
5 | bool WriteBinaryFile(const char* pFileName, const std::vector<uint8_t>& data)
6 | {
7 |     FILE* fp = fopen(pFileName, "wb");
8 |     if (fp)
9 |     {
10 |         if (data.size())
11 |         {
12 |             fwrite(&data[0], 1, data.size(), fp);
13 |         }
14 |         fclose(fp);
15 |     }
16 |     else
17 |     {
18 |         std::cout << "Failed to open " << pFileName << "\n";
19 |         // NOTE: fp is NULL here, so there is nothing to close.
20 |         return false;
21 |     }
22 |     return true;
23 | }
24 |
25 | bool ReadBinaryFile(const char* pFileName, std::vector<uint8_t>& image)
26 | {
27 |     FILE* fp = fopen(pFileName, "rb");
28 |     if (!fp)
29 |     {
30 |         std::cout << "Failed to open " << pFileName << "\n";
31 |         return false;
32 |     }
33 |
34 |     fseek(fp, 0, SEEK_END);
35 |     const long fileLength = ftell(fp);
36 |     fseek(fp, 0, SEEK_SET);
37 |     if (!fileLength)
38 |     {
39 |         std::cout << pFileName << " has zero length\n";
40 |         fclose(fp);
41 |         return false;
42 |     }
43 |
44 |     image.resize((size_t)fileLength);
45 |     fread(&image[0], 1, image.size(), fp);
46 |     fclose(fp);
47 |     return true;
48 | }
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/c_util/ScopeExit.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | template <typename T>
4 |
5 | class ScopeExit
6 | {
7 | public:
8 |     ScopeExit(T t) : t(t) {}
9 |     ~ScopeExit() { t(); }
10 |     T t;
11 | };
12 |
13 | template <typename T>
14 | ScopeExit<T> MoveScopeExit(T t) {
15 |     return ScopeExit<T>(t);
16 | };
17 |
18 | #define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
19 | #define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) NV_ANONYMOUS_VARIABLE_DIRECT(name, line)
20 |
21 | #define SCOPE_EXIT(func) const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = MoveScopeExit([=](){func();})
22 |
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/profilerhost_util/Eval.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <string>
4 | #include <vector>
5 | #include <nvperf_host.h>
6 |
7 | namespace NV {
8 | namespace Metric {
9 | namespace Eval {
10 |     struct MetricNameValue {
11 |         std::string metricName;
12 |         int numRanges;
13 |         // <rangeName, metricValue> pair
14 |         std::vector<std::pair<std::string, double>> rangeNameMetricValueMap;
15 |     };
16 |
17 |
18 |     /* Function to get aggregate metric value
19 |      * @param[in] chipName Chip name for which to get metric values
20 |      * @param[in] counterDataImage Counter data image
21 |      * @param[in] metricNames List of metrics to read from counter data image
22 |      * @param[out] metricNameValueMap Metric name value map
23 |      */
24 |     bool GetMetricGpuValue(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<uint8_t> counterDataImage, std::vector<std::string> metricNames, std::vector<MetricNameValue>& metricNameValueMap);
25 |
26 |     bool PrintMetricValues(std::string chipName, std::vector<uint8_t> counterDataImage, std::vector<std::string> metricNames);
27 |
28 | }
29 | }
30 | }
31 |
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/profilerhost_util/List.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | namespace NV {
3 | namespace Metric {
4 | namespace Enum {
5 |     // Function to print list of all
supported chips
6 |     bool ListSupportedChips();
7 |
8 |     /* Function to print list of all metrics for a given chip
9 |      * @param[in] chipName Chip Name for which metrics are to be listed
10 |      * @param[in] listSubMetrics Whether submetrics (Peak, PerCycle, PctOfPeak) are to be listed or not
11 |      */
12 |     bool ListMetrics(const char* chipName, bool listSubMetrics);
13 | }
14 | }
15 | }
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/profilerhost_util/Metric.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <string>
4 | #include <vector>
5 | #include <nvperf_host.h>
6 |
7 | namespace NV {
8 | namespace Metric {
9 | namespace Config {
10 |     /* Function to get Config image
11 |      * @param[in] chipName Chip name for which configImage is to be generated
12 |      * @param[in] metricNames List of metrics for which configImage is to be generated
13 |      * @param[out] configImage Generated configImage
14 |      */
15 |     bool GetConfigImage(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<std::string> metricNames, std::vector<uint8_t>& configImage);
16 |
17 |     /* Function to get CounterDataPrefix image
18 |      * @param[in] chipName Chip name for which counterDataImagePrefix is to be generated
19 |      * @param[in] metricNames List of metrics for which counterDataImagePrefix is to be generated
20 |      * @param[out] counterDataImagePrefix Generated counterDataImagePrefix
21 |      */
22 |     bool GetCounterDataPrefixImage(NVPA_MetricsContext* metricsContext, std::string chipName, std::vector<std::string> metricNames, std::vector<uint8_t>& counterDataImagePrefix);
23 | }
24 | }
25 | }
26 |
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/include/profilerhost_util/Parser.h: --------------------------------------------------------------------------------
1 | #include <string>
2 |
3 | namespace NV {
4 | namespace Metric {
5 | namespace Parser {
6 |     inline bool ParseMetricNameString(const std::string& metricName, std::string* reqName, bool* isolated, bool* keepInstances)
7 |     {
8 |         std::string& name = *reqName;
9 |         name = metricName;
10 |         if (name.empty())
11 |         {
12 |             return false;
13 |         }
14 |
15 |         // boost program_options sometimes inserts a \n between the metric name and a '&' at the end
16 |         size_t pos = name.find('\n');
17 |         if (pos != std::string::npos)
18 |         {
19 |             name.erase(pos, 1);
20 |         }
21 |
22 |         // trim whitespace
23 |         while (name.back() == ' ')
24 |         {
25 |             name.pop_back();
26 |             if (name.empty())
27 |             {
28 |                 return false;
29 |             }
30 |         }
31 |
32 |         *keepInstances = false;
33 |         if (name.back() == '+')
34 |         {
35 |             *keepInstances = true;
36 |             name.pop_back();
37 |             if (name.empty())
38 |             {
39 |                 return false;
40 |             }
41 |         }
42 |
43 |         *isolated = true;
44 |         if (name.back() == '$')
45 |         {
46 |             name.pop_back();
47 |             if (name.empty())
48 |             {
49 |                 return false;
50 |             }
51 |         }
52 |         else if (name.back() == '&')
53 |         {
54 |             *isolated = false;
55 |             name.pop_back();
56 |             if (name.empty())
57 |             {
58 |                 return false;
59 |             }
60 |         }
61 |
62 |         return true;
63 |     }
64 | }
65 | }
66 | }
-------------------------------------------------------------------------------- /cpp/external/cupti_profilerhost_util/src/profilerhost_util/List.cpp: --------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <iostream>
3 | #include <nvperf_host.h>
4 | #include <List.h>
5 | #include <ScopeExit.h>
6 |
7 | #define RETURN_IF_NVPW_ERROR(retval, actual) \
8 |     do { \
9 |         if (NVPA_STATUS_SUCCESS != actual) { \
10 |             fprintf(stderr, "FAILED:
%s\n", #actual); \ 11 | return retval; \ 12 | } \ 13 | } while (0) 14 | 15 | namespace NV { 16 | namespace Metric { 17 | namespace Enum { 18 | bool ListSupportedChips() { 19 | NVPW_GetSupportedChipNames_Params getSupportedChipNames = { NVPW_GetSupportedChipNames_Params_STRUCT_SIZE }; 20 | RETURN_IF_NVPW_ERROR(false, NVPW_GetSupportedChipNames(&getSupportedChipNames)); 21 | std::cout << "\n Number of supported chips : " << getSupportedChipNames.numChipNames; 22 | std::cout << "\n List of supported chips : \n"; 23 | 24 | for (size_t i = 0; i < getSupportedChipNames.numChipNames; i++) { 25 | std::cout << " " << getSupportedChipNames.ppChipNames[i] << "\n"; 26 | } 27 | 28 | return true; 29 | } 30 | 31 | bool ListMetrics(const char* chip, bool listSubMetrics) { 32 | 33 | NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = { NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE }; 34 | metricsContextCreateParams.pChipName = chip; 35 | RETURN_IF_NVPW_ERROR(false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams)); 36 | 37 | NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = { NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE }; 38 | metricsContextDestroyParams.pMetricsContext = metricsContextCreateParams.pMetricsContext; 39 | SCOPE_EXIT([&]() { NVPW_MetricsContext_Destroy((NVPW_MetricsContext_Destroy_Params *)&metricsContextDestroyParams); }); 40 | 41 | NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = { NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE }; 42 | getMetricNameBeginParams.pMetricsContext = metricsContextCreateParams.pMetricsContext; 43 | getMetricNameBeginParams.hidePeakSubMetrics = !listSubMetrics; 44 | getMetricNameBeginParams.hidePerCycleSubMetrics = !listSubMetrics; 45 | getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listSubMetrics; 46 | RETURN_IF_NVPW_ERROR(false, NVPW_MetricsContext_GetMetricNames_Begin(&getMetricNameBeginParams)); 47 | 48 | NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = { NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE }; 49 | getMetricNameEndParams.pMetricsContext = metricsContextCreateParams.pMetricsContext; 50 | SCOPE_EXIT([&]() { NVPW_MetricsContext_GetMetricNames_End((NVPW_MetricsContext_GetMetricNames_End_Params *)&getMetricNameEndParams); }); 51 | 52 | std::cout << getMetricNameBeginParams.numMetrics << " metrics in total on the chip\n Metrics List : \n"; 53 | for (size_t i = 0; i < getMetricNameBeginParams.numMetrics; i++) { 54 | std::cout << getMetricNameBeginParams.ppMetricNames[i] << "\n"; 55 | } 56 | 57 | return true; 58 | } 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /cpp/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(cuda) 2 | add_subdirectory(frontend) 3 | -------------------------------------------------------------------------------- /cpp/src/cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This library encapsulates Habitat's bindings to CUDA related utilities, such as CUPTI and the standalone 2 | # occupancy calculator. It should not be dependent on other Habitat libraries. 
3 | add_library(habitat-cuda-lib
4 |   diagnostics.cu
5 |   cupti_exceptions.cpp
6 |   cupti_manager.cpp
7 |   cupti_profiler.cpp
8 |   cupti_tracer.cpp
9 |   kernel.cpp
10 |   legacy_cupti_profiler.cpp
11 |   new_cupti_profiler.cpp
12 | )
13 |
14 | target_link_libraries(habitat-cuda-lib
15 |   PRIVATE
16 |   CUDA::cupti
17 |   CUDA::cudart
18 |   CUDA::cuda_driver
19 |   cupti_profilerhost_util
20 |   ${NVPerf_LIBRARIES}
21 | )
22 |
23 | target_include_directories(habitat-cuda-lib
24 |   PRIVATE
25 |   ${CUPTI_INCLUDE_DIRS}
26 | )
27 |
28 | target_compile_options(
29 |   habitat-cuda-lib
30 |   PRIVATE
31 |   $<$<COMPILE_LANGUAGE:CUDA>:-use_fast_math>
32 |   "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_60,code=sm_60>"
33 |   "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_61,code=sm_61>"
34 |   "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_70,code=sm_70>"
35 |   "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:-gencode arch=compute_75,code=sm_75>"
36 | )
37 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cuda_macros.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cuda_runtime.h>
4 | #include <stdexcept>
5 |
6 | // This header should only be included in source files (i.e. .cpp files) that contain CUDA API calls.
7 |
8 | #define RUNTIME_API_CALL(apiFuncCall) \
9 |     do { \
10 |         cudaError_t _status = apiFuncCall; \
11 |         if (_status != cudaSuccess) { \
12 |             fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
13 |                 __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
14 |             throw std::runtime_error("CUDA Runtime API call failed."); \
15 |         } \
16 |     } while (0)
17 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_exceptions.cpp: --------------------------------------------------------------------------------
1 | #include "cupti_exceptions.h"
2 |
3 | #include <cupti.h>
4 |
5 | namespace habitat {
6 | namespace cuda {
7 |
8 | CuptiError::CuptiError(CUptiResult error_code, const char* error_message)
9 |     : std::runtime_error(std::string(error_message)),
10 |       error_code_(error_code) {}
11 |
12 | CuptiError CuptiError::from(CUptiResult error_code) {
13 |     const char* message;
14 |     cuptiGetResultString(error_code, &message);
15 |     return CuptiError(error_code, message);
16 | }
17 |
18 | }
19 | }
20 |
21 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_exceptions.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <stdexcept>
4 | #include <cupti.h>
5 |
6 | namespace habitat {
7 | namespace cuda {
8 |
9 | class CuptiError : public std::runtime_error {
10 | public:
11 |     static CuptiError from(CUptiResult error_code);
12 |
13 |     CUptiResult errorCode() const {
14 |         return error_code_;
15 |     }
16 |
17 | private:
18 |     CuptiError(CUptiResult error_code, const char* error_message);
19 |     CUptiResult error_code_;
20 | };
21 |
22 | }
23 | }
24 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_macros.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <stdio.h>
4 | #include <stdexcept>
5 | #include <cupti.h>
6 |
7 | #include "cupti_exceptions.h"
8 |
9 | // This header should only be included in source files (i.e. .cpp files) that contain CUPTI API calls.
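// Usage sketch (illustrative; mirrors a call made in cupti_tracer.cpp):
//   CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
// On failure the macro prints the file, line, and CUPTI error string, then
// throws habitat::cuda::CuptiError.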
10 |
11 | #define CUPTI_CALL(call) \
12 |     do { \
13 |         CUptiResult _status = call; \
14 |         if (_status != CUPTI_SUCCESS) { \
15 |             const char* message; \
16 |             cuptiGetResultString(_status, &message); \
17 |             fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n", \
18 |                 __FILE__, __LINE__, #call, message); \
19 |             throw habitat::cuda::CuptiError::from(_status); \
20 |         } \
21 |     } while (0)
22 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_manager.cpp: --------------------------------------------------------------------------------
1 | #include "habitat_cupti.h"
2 |
3 | #include <algorithm>
4 | #include <functional>
5 | #include <memory>
6 | #include <stdexcept>
7 | #include <cuda_runtime.h>
8 | #include <cupti.h>
9 |
10 | #include "cupti_profiler.h"
11 |
12 | namespace habitat {
13 | namespace cuda {
14 |
15 | CuptiManager::CuptiManager()
16 |     : profiler_(CuptiProfiler::create()),
17 |       callbacks_bound_(false),
18 |       should_cache_metrics_(false) {}
19 |
20 | CuptiManager& CuptiManager::instance() {
21 |     static std::unique_ptr<CuptiManager> manager(new CuptiManager());
22 |     return *manager;
23 | }
24 |
25 | // CuptiManager::allocateTracer() is defined in cupti_tracer.cpp
26 |
27 | void CuptiManager::unloadCupti() {
28 |     if (tracers_.size() > 0) {
29 |         throw std::runtime_error("Cannot unload CUPTI because at least one tracer is still bound.");
30 |     }
31 |     cudaDeviceSynchronize();
32 |     cuptiActivityFlushAll(0);
33 |     cuptiFinalize();
34 |     callbacks_bound_ = false;
35 | }
36 |
37 | void CuptiManager::newKernelInstance(KernelInstance info) {
38 |     for (auto& tracer : tracers_) {
39 |         tracer->kernels_.push_back(info);
40 |     }
41 | }
42 |
43 | void CuptiManager::measureMetric(
44 |     const std::string& metric_name,
45 |     std::function<void(void)> runnable,
46 |     std::vector<KernelInstance>& kernels) {
47 |     if (tracers_.size() > 0) {
48 |         throw std::runtime_error(
49 |             "A CuptiTracer instance still exists. Metrics cannot be measured when tracing is being performed.");
50 |     }
51 |
52 |     // If the cache can fulfil the metrics request, just use the cache
53 |     if (should_cache_metrics_ &&
54 |         std::all_of(kernels.cbegin(), kernels.cend(), [&](auto& kernel) {
55 |             auto it = metrics_cache_.find(kernel.metadata());
56 |             return it != metrics_cache_.end() && it->second.count(metric_name) > 0;
57 |         })) {
58 |
59 |         for (auto& kernel : kernels) {
60 |             auto it = metrics_cache_.find(kernel.metadata());
61 |             double value = it->second.at(metric_name);
62 |             kernel.addMetric(metric_name, value);
63 |         }
64 |         return;
65 |     }
66 |
67 |     // Otherwise, run the profiler
68 |     profiler_->profile(metric_name, runnable, kernels);
69 |
70 |     // Update the cache if needed
71 |     if (should_cache_metrics_) {
72 |         for (auto& kernel : kernels) {
73 |             auto& metrics_map = metrics_cache_[kernel.metadata()];
74 |             for (const auto& metric_pair : kernel.metrics()) {
75 |                 // NOTE: Map insert/emplace will only do the actual insertion if the key has not been used.
76 |                 // Otherwise it just returns an iterator to the existing value.
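// Illustrative sketch of the emplace semantics relied upon below
// (hypothetical values, not project code):
//   std::unordered_map<std::string, double> m;
//   m.emplace("metric", 1.0);  // inserts {"metric", 1.0}
//   m.emplace("metric", 2.0);  // no-op: key exists, returns {it, false}
// so previously cached metric values are never overwritten.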
77 |                 metrics_map.emplace(metric_pair);
78 |             }
79 |         }
80 |     }
81 | }
82 |
83 | void CuptiManager::setCacheMetrics(bool should_cache) {
84 |     if (should_cache_metrics_ && !should_cache) {
85 |         metrics_cache_.clear();
86 |     }
87 |     should_cache_metrics_ = should_cache;
88 | }
89 |
90 | bool CuptiManager::isCachingMetrics() const {
91 |     return should_cache_metrics_;
92 | }
93 |
94 | }
95 | }
96 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_profiler.cpp: --------------------------------------------------------------------------------
1 | #include "cupti_profiler.h"
2 |
3 | #include <memory>
4 | #include <stdexcept>
5 | #include <vector>
6 |
7 | #include "legacy_cupti_profiler.h"
8 | #include "new_cupti_profiler.h"
9 | #include "cuda_macros.h"
10 |
11 | namespace habitat {
12 | namespace cuda {
13 |
14 | std::unique_ptr<CuptiProfiler> CuptiProfiler::create() {
15 |     cudaDeviceProp properties;
16 |     RUNTIME_API_CALL(cudaGetDeviceProperties(&properties, 0));
17 |     if (properties.major >= 7) {
18 |         // The new profiler is used for Volta and newer GPUs
19 |         return std::unique_ptr<CuptiProfiler>(new NewCuptiProfiler());
20 |     } else {
21 |         return std::unique_ptr<CuptiProfiler>(new LegacyCuptiProfiler());
22 |     }
23 | }
24 |
25 | void CuptiProfiler::addMetrics(
26 |         std::vector<KernelInstance>& kernels, const std::vector<KernelMetric>& metrics, const std::string& metric_name) {
27 |     // Right now our "profiling model" is that only one metric is measured. NVIDIA's CUPTI documentation is
28 |     // unfortunately not that clear, so we don't know exactly how profiled "ranges" map to kernels. Right now
29 |     // we assume that the order of metric values matches the order in which the kernels were launched.
30 |     if (kernels.size() != metrics.size()) {
31 |         // Not sure how to proceed - we should throw an exception to be safe.
32 |         throw std::runtime_error("Encountered a KernelInstance and metrics vector size mismatch!");
33 |     }
34 |
35 |     for (size_t i = 0; i < kernels.size(); i++) {
36 |         kernels.at(i).addMetric(metric_name, metrics.at(i).metricValue());
37 |     }
38 | }
39 |
40 | }
41 | }
42 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_profiler.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <functional>
4 | #include <memory>
5 | #include <string>
6 | #include <vector>
7 | #include "kernel.h"
8 | #include "metrics.h"
9 |
10 | namespace habitat {
11 | namespace cuda {
12 |
13 | class CuptiManager;
14 |
15 | class CuptiProfiler {
16 | public:
17 |     virtual ~CuptiProfiler() {}
18 |
19 |     /**
20 |      * Perform the profiling to measure the requested metric.
21 |      */
22 |     virtual void profile(
23 |         const std::string& metric_name,
24 |         std::function<void(void)> runnable,
25 |         std::vector<KernelInstance>& kernels) const = 0;
26 |
27 | protected:
28 |     CuptiProfiler() {}
29 |
30 |     /**
31 |      * Utility function used to add measured kernel metrics to their associated KernelInstances.
32 |      */
33 |     static void addMetrics(
34 |         std::vector<KernelInstance>& kernels,
35 |         const std::vector<KernelMetric>& metrics,
36 |         const std::string& metric_name);
37 |
38 | private:
39 |     static std::unique_ptr<CuptiProfiler> create();
40 |     friend class CuptiManager;
41 | };
42 |
43 | }
44 | }
45 |
-------------------------------------------------------------------------------- /cpp/src/cuda/cupti_tracer.cpp: --------------------------------------------------------------------------------
1 | #include "habitat_cupti.h"
2 |
3 | #include <algorithm>
4 | #include <iostream>
5 | #include <stdint.h>
6 | #include <stdlib.h>
7 |
8 | #include <cuda_runtime.h>
9 | #include <cupti.h>
10 |
11 | #include "cupti_macros.h"
12 |
13 | #define BUF_SIZE (32 * 1024)
14 | #define ALIGN_SIZE (8)
15 | #define ALIGN_BUFFER(buffer, align) \
16 |     (((uintptr_t) (buffer) & ((align)-1)) ? ((buffer) + (align) - ((uintptr_t) (buffer) & ((align)-1))) : (buffer))
17 |
18 | namespace {
19 |
20 | void handleCuptiActivity(CUpti_Activity* record) {
21 |     if (record->kind != CUPTI_ACTIVITY_KIND_KERNEL && record->kind != CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL) {
22 |         return;
23 |     }
24 |     CUpti_ActivityKernel4* kernel = (CUpti_ActivityKernel4 *) record;
25 |     habitat::cuda::KernelInstance instance(
26 |         habitat::cuda::KernelMetadata(
27 |             std::string(kernel->name),
28 |             kernel->gridX * kernel->gridY * kernel->gridZ,
29 |             kernel->blockX * kernel->blockY * kernel->blockZ,
30 |             kernel->dynamicSharedMemory,
31 |             kernel->staticSharedMemory,
32 |             kernel->registersPerThread),
33 |         kernel->end - kernel->start);
34 |     habitat::cuda::CuptiManager::instance().newKernelInstance(std::move(instance));
35 | }
36 |
37 | void cuptiBufferRequested(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
38 |     uint8_t *bfr = (uint8_t *) malloc(BUF_SIZE + ALIGN_SIZE);
39 |     if (bfr == NULL) {
40 |         std::cerr << "ERROR: Out of memory! (malloc in CUPTI)" << std::endl;
41 |         exit(-1);
42 |     }
43 |
44 |     *size = BUF_SIZE;
45 |     *buffer = ALIGN_BUFFER(bfr, ALIGN_SIZE);
46 |     *maxNumRecords = 0;
47 | }
48 |
49 | void cuptiBufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
50 |     CUptiResult status;
51 |     CUpti_Activity *record = NULL;
52 |
53 |     if (validSize > 0) {
54 |         do {
55 |             status = cuptiActivityGetNextRecord(buffer, validSize, &record);
56 |             if (status == CUPTI_SUCCESS) {
57 |                 handleCuptiActivity(record);
58 |
59 |             } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
60 |                 break;
61 |
62 |             } else {
63 |                 CUPTI_CALL(status);
64 |             }
65 |         } while (1);
66 |
67 |         // report any records dropped from the queue
68 |         size_t dropped;
69 |         CUPTI_CALL(cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
70 |         if (dropped != 0) {
71 |             std::cerr << "WARNING: CUPTI dropped " << dropped << " activity records." << std::endl;
72 |         }
73 |     }
74 |
75 |     free(buffer);
76 | }
77 |
78 | inline void enableCuptiRecording() {
79 |     CUPTI_CALL(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
80 | }
81 |
82 | inline void flushCupti() {
83 |     cudaDeviceSynchronize();
84 |     cuptiActivityFlushAll(0);
85 | }
86 |
87 | inline void disableCuptiRecording() {
88 |     flushCupti();
89 |     CUPTI_CALL(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
90 | }
91 |
92 | }
93 |
94 | namespace habitat {
95 | namespace cuda {
96 |
97 | CuptiTracer::CuptiTracer() {}
98 |
99 | std::vector<KernelInstance>&& CuptiTracer::kernels() && {
100 |     flushCupti();
101 |     return std::move(kernels_);
102 | }
103 |
104 | std::vector<KernelInstance> CuptiTracer::lastKernels(size_t num_iterations) const {
105 |     flushCupti();
106 |     // NOTE: We assume that, after this point, no more kernels will be appended to the kernels_ vector.
107 |     if (kernels_.size() % num_iterations != 0) {
108 |         throw std::runtime_error("Recorded kernel size mismatch!");
109 |     }
110 |
111 |     size_t num_kernels = kernels_.size() / num_iterations;
112 |     std::vector<KernelInstance> results;
113 |     results.reserve(num_kernels);
114 |     results.insert(results.begin(), kernels_.end() - num_kernels, kernels_.end());
115 |     return results;
116 | }
117 |
118 | CuptiTracer::Ptr CuptiManager::allocateTracer() {
119 |     if (!callbacks_bound_) {
120 |         CUPTI_CALL(cuptiActivityRegisterCallbacks(cuptiBufferRequested, cuptiBufferCompleted));
121 |         callbacks_bound_ = true;
122 |     }
123 |     if (tracers_.size() == 0) {
124 |         enableCuptiRecording();
125 |     }
126 |     CuptiTracer::Ptr ptr(new CuptiTracer(), &detail::cuptiTracerDeleter);
127 |     tracers_.push_back(ptr.get());
128 |     return ptr;
129 | }
130 |
131 | namespace detail {
132 |
133 | void cuptiTracerDeleter(CuptiTracer* tracer) {
134 |     CuptiManager& manager = CuptiManager::instance();
135 |     auto it = std::find(manager.tracers_.begin(), manager.tracers_.end(), tracer);
136 |     if (it == manager.tracers_.end()) {
137 |         // Assertion failure
138 |         throw std::runtime_error("Did not find CUPTI tracer in the manager's list when deleting!");
139 |     }
140 |     manager.tracers_.erase(it);
141 |     if (manager.tracers_.size() == 0) {
142 |         disableCuptiRecording();
143 |     }
144 |     delete tracer;
145 | }
146 |
147 | }
148 | }
149 | }
150 |
-------------------------------------------------------------------------------- /cpp/src/cuda/diagnostics.cu: --------------------------------------------------------------------------------
1 | #define SIZE 10000
2 |
3 | namespace {
4 |
5 | // We need this global variable to ensure that nvcc does not optimize away the
6 | // operations inside flop_test().
7 | __device__ float accum = 0.;
8 |
9 | __global__ void flop_test() {
10 |     float a = 0.1;
11 |     #pragma unroll
12 |     for (size_t i = 0; i < SIZE; i++) {
13 |         accum += a;
14 |     }
15 | }
16 |
17 | }
18 |
19 | namespace habitat {
20 | namespace cuda {
21 | namespace diagnostics {
22 |
23 | void run_flop_test(size_t num_blocks, size_t threads_per_block) {
24 |     flop_test<<<num_blocks, threads_per_block>>>();
25 | }
26 |
27 | }
28 | }
29 | }
30 |
-------------------------------------------------------------------------------- /cpp/src/cuda/diagnostics.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | namespace habitat {
4 | namespace cuda {
5 | namespace diagnostics {
6 |
7 | /**
8 |  * Launches a single kernel that repeatedly performs 32-bit floating point adds.
9 |  *
10 |  * This diagnostic kernel is used to help us determine the peak performance
11 |  * (GFLOP/s) of a device.
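 *
 * Back-of-envelope sketch (illustrative, using SIZE = 10000 from
 * diagnostics.cu): one launch performs roughly
 * num_blocks * threads_per_block * 10000 single-precision adds, so dividing
 * that count by the measured kernel time (in seconds) estimates FLOP/s.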
12 |  */
13 | void run_flop_test(size_t num_blocks = 8192, size_t threads_per_block = 256);
14 |
15 | }
16 | }
17 | }
18 |
-------------------------------------------------------------------------------- /cpp/src/cuda/habitat_cupti.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <functional>
4 | #include <memory>
5 | #include <string>
6 | #include <unordered_map>
7 | #include <vector>
8 |
9 | #include "kernel.h"
10 | #include "metrics.h"
11 | #include "cupti_profiler.h"
12 |
13 | namespace habitat {
14 | namespace cuda {
15 |
16 | class CuptiManager;
17 | class CuptiTracer;
18 |
19 | namespace detail {
20 |
21 | void cuptiTracerDeleter(CuptiTracer* tracer);
22 |
23 | }
24 |
25 | /**
26 |  * Accumulates kernel invocations during its lifetime.
27 |  */
28 | class CuptiTracer {
29 | public:
30 |     using Ptr = std::unique_ptr<CuptiTracer, decltype(&detail::cuptiTracerDeleter)>;
31 |
32 |     CuptiTracer(const CuptiTracer&) = delete;
33 |     CuptiTracer& operator=(const CuptiTracer&) = delete;
34 |
35 |     /**
36 |      * Grab all the recorded kernels.
37 |      */
38 |     std::vector<KernelInstance>&& kernels() &&;
39 |
40 |     /**
41 |      * Assuming the same operation was recorded multiple times, this returns the kernels in the last operation.
42 |      */
43 |     std::vector<KernelInstance> lastKernels(size_t num_iterations) const;
44 |
45 | private:
46 |     CuptiTracer();
47 |     std::vector<KernelInstance> kernels_;
48 |
49 |     friend class CuptiManager;
50 | };
51 |
52 | /**
53 |  * Singleton that manages bindings to CUPTI.
54 |  */
55 | class CuptiManager {
56 | public:
57 |     static CuptiManager& instance();
58 |     CuptiTracer::Ptr allocateTracer();
59 |
60 |     /**
61 |      * Measures the specified metric for the kernels invoked by a given runnable.
62 |      *
63 |      * The metrics will be appended to the respective KernelInstances passed in to
64 |      * this method.
65 |      *
66 |      * NOTE: The kernels invoked by the runnable must already exist as KernelInstances
67 |      *       (i.e. the tracer must be used first). This is because some of the CUPTI
68 |      *       metrics APIs require the execution time of the kernels.
69 |      */
70 |     void measureMetric(
71 |         const std::string& metric_name,
72 |         std::function<void(void)> runnable,
73 |         std::vector<KernelInstance>& kernels);
74 |
75 |     /**
76 |      * Ensures CUPTI is unloaded from the process.
77 |      *
78 |      * This throws an exception if there are still CuptiTracers bound to the manager.
79 |      */
80 |     void unloadCupti();
81 |
82 |     /**
83 |      * This method is NOT intended to be called directly by end users.
84 |      *
85 |      * This method is called by CUPTI when recording a trace of the kernels that have been executed.
86 |      */
87 |     void newKernelInstance(KernelInstance info);
88 |
89 |     /**
90 |      * Use this method to control whether or not the manager should cache kernel metrics.
91 |      * This can be useful because metrics gathering is very slow on Volta and newer generations.
92 |      *
93 |      * By default caching is disabled.
94 |      */
95 |     void setCacheMetrics(bool should_cache);
96 |
97 |     /**
98 |      * Returns whether or not the manager is caching kernel metrics.
99 |      */
100 |     bool isCachingMetrics() const;
101 |
102 |     CuptiManager(const CuptiManager&) = delete;
103 |     CuptiManager& operator=(const CuptiManager&) = delete;
104 |
105 | private:
106 |     CuptiManager();
107 |     std::unique_ptr<CuptiProfiler> profiler_;
108 |     std::vector<CuptiTracer*> tracers_;
109 |     std::unordered_map<KernelMetadata, std::unordered_map<std::string, double>> metrics_cache_;
110 |     bool callbacks_bound_;
111 |     bool should_cache_metrics_;
112 |
113 |     friend void detail::cuptiTracerDeleter(CuptiTracer *tracer);
114 | };
115 |
116 | /**
117 |  * RAII-helper to ensure CUPTI is unloaded.
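 *
 * Usage sketch (illustrative, not from the original sources):
 *   {
 *     habitat::cuda::CuptiGuard guard;
 *     // ... allocate tracers and take measurements ...
 *   }  // the destructor calls CuptiManager::instance().unloadCupti() here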
118 |  */
119 | class CuptiGuard {
120 | public:
121 |     ~CuptiGuard() {
122 |         CuptiManager::instance().unloadCupti();
123 |     }
124 | };
125 |
126 | }
127 | }
128 |
-------------------------------------------------------------------------------- /cpp/src/cuda/kernel.cpp: --------------------------------------------------------------------------------
1 | #include "kernel.h"
2 |
3 | namespace {
4 |
5 | // We don't want any symbols from this header to be visible outside of this file
6 | #include "cuda_occupancy.h"
7 |
8 | cudaOccDeviceProp occDeviceProps(const habitat::cuda::DeviceProperties& properties) {
9 |     cudaOccDeviceProp device_properties;
10 |     device_properties.computeMajor = properties.compute_major;
11 |     device_properties.computeMinor = properties.compute_minor;
12 |     device_properties.maxThreadsPerBlock = properties.max_threads_per_block;
13 |     device_properties.maxThreadsPerMultiprocessor = properties.max_threads_per_multiprocessor;
14 |     device_properties.regsPerBlock = properties.regs_per_block;
15 |     device_properties.regsPerMultiprocessor = properties.regs_per_multiprocessor;
16 |     device_properties.warpSize = properties.warp_size;
17 |     device_properties.sharedMemPerBlock = properties.shared_mem_per_block;
18 |     device_properties.sharedMemPerMultiprocessor = properties.shared_mem_per_multiprocessor;
19 |     device_properties.numSms = properties.num_sms;
20 |     device_properties.sharedMemPerBlockOptin = properties.shared_mem_per_block_optin;
21 |     return device_properties;
22 | }
23 |
24 | }
25 |
26 | namespace habitat {
27 | namespace cuda {
28 |
29 | uint32_t KernelMetadata::threadBlockOccupancy(const DeviceProperties& device) const {
30 |     return threadBlockOccupancy(device, registers_per_thread_);
31 | }
32 |
33 | uint32_t KernelMetadata::threadBlockOccupancy(
34 |         const DeviceProperties& device, uint16_t registers_per_thread) const {
35 |     cudaOccDeviceProp device_properties(occDeviceProps(device));
36 |     cudaOccDeviceState device_state;
37 |     cudaOccFuncAttributes attributes;
38 |     attributes.maxThreadsPerBlock = INT_MAX;
39 |     attributes.maxDynamicSharedSizeBytes = INT_MAX;
40 |     attributes.numRegs = registers_per_thread;
41 |     attributes.sharedSizeBytes = static_shared_memory_;
42 |
43 |     int res;
44 |     cudaOccResult result;
45 |     if ((res = cudaOccMaxActiveBlocksPerMultiprocessor(
46 |             &result,
47 |             &device_properties,
48 |             &attributes,
49 |             &device_state,
50 |             block_size_,
51 |             dynamic_shared_memory_)) != CUDA_OCC_SUCCESS) {
52 |         return 0;
53 |     }
54 |
55 |     return result.activeBlocksPerMultiprocessor;
56 | }
57 |
58 | bool operator==(const KernelMetadata& lhs, const KernelMetadata& rhs) {
59 |     return lhs.num_blocks_ == rhs.num_blocks_ &&
60 |         lhs.block_size_ == rhs.block_size_ &&
61 |         lhs.dynamic_shared_memory_ == rhs.dynamic_shared_memory_ &&
62 |         lhs.static_shared_memory_ == rhs.static_shared_memory_ &&
63 |         lhs.registers_per_thread_ == rhs.registers_per_thread_ &&
64 |         lhs.name_ == rhs.name_;
65 | }
66 |
67 | void KernelInstance::addMetric(std::string name, double value) {
68 |     metrics_.push_back(std::make_pair(std::move(name), value));
69 | }
70 |
71 | }
72 | }
73 |
-------------------------------------------------------------------------------- /cpp/src/cuda/kernel.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cstdint>
4 | #include <string>
5 | #include <utility>
6 | #include <vector>
7 |
8 | #include "sampled_measurement.h"
9 | #include "utils.h"
10 |
11 | namespace habitat {
12 | namespace cuda {
13 |
14 | struct DeviceProperties {
15 | 
DeviceProperties( 16 | std::string name, 17 | int compute_major, 18 | int compute_minor, 19 | int max_threads_per_block, 20 | int max_threads_per_multiprocessor, 21 | int regs_per_block, 22 | int regs_per_multiprocessor, 23 | int warp_size, 24 | size_t shared_mem_per_block, 25 | size_t shared_mem_per_multiprocessor, 26 | int num_sms, 27 | size_t shared_mem_per_block_optin, 28 | int mem_bandwidth_gb, 29 | size_t base_clock_mhz, 30 | size_t peak_gflops_per_second) 31 | : name(std::move(name)), 32 | compute_major(compute_major), 33 | compute_minor(compute_minor), 34 | max_threads_per_block(max_threads_per_block), 35 | max_threads_per_multiprocessor(max_threads_per_multiprocessor), 36 | regs_per_block(regs_per_block), 37 | regs_per_multiprocessor(regs_per_multiprocessor), 38 | warp_size(warp_size), 39 | shared_mem_per_block(shared_mem_per_block), 40 | shared_mem_per_multiprocessor(shared_mem_per_multiprocessor), 41 | num_sms(num_sms), 42 | shared_mem_per_block_optin(shared_mem_per_block_optin), 43 | mem_bandwidth_gb(mem_bandwidth_gb), 44 | base_clock_mhz(base_clock_mhz), 45 | peak_gflops_per_second(peak_gflops_per_second) {} 46 | 47 | std::string name; 48 | int compute_major; 49 | int compute_minor; 50 | int max_threads_per_block; 51 | int max_threads_per_multiprocessor; 52 | int regs_per_block; 53 | int regs_per_multiprocessor; 54 | int warp_size; 55 | size_t shared_mem_per_block; 56 | size_t shared_mem_per_multiprocessor; 57 | int num_sms; 58 | size_t shared_mem_per_block_optin; 59 | int mem_bandwidth_gb; 60 | size_t base_clock_mhz; 61 | size_t peak_gflops_per_second; 62 | }; 63 | 64 | class KernelMetadata { 65 | public: 66 | KernelMetadata( 67 | std::string name, 68 | int32_t num_blocks, 69 | int32_t block_size, 70 | int32_t dynamic_shared_memory, 71 | int32_t static_shared_memory, 72 | uint16_t registers_per_thread) 73 | : name_(std::move(name)), 74 | num_blocks_(num_blocks), 75 | block_size_(block_size), 76 | dynamic_shared_memory_(dynamic_shared_memory), 77 | static_shared_memory_(static_shared_memory), 78 | registers_per_thread_(registers_per_thread) {} 79 | 80 | const std::string& name() const { 81 | return name_; 82 | } 83 | 84 | int32_t numBlocks() const { 85 | return num_blocks_; 86 | } 87 | 88 | int32_t blockSize() const { 89 | return block_size_; 90 | } 91 | 92 | int32_t dynamicSharedMemory() const { 93 | return dynamic_shared_memory_; 94 | } 95 | 96 | int32_t staticSharedMemory() const { 97 | return static_shared_memory_; 98 | } 99 | 100 | uint16_t registersPerThread() const { 101 | return registers_per_thread_; 102 | } 103 | 104 | /** 105 | * Returns the theoretical thread block occupancy for this kernel when running on a 106 | * specified GPU. The return value is zero if an error occurred. 107 | */ 108 | uint32_t threadBlockOccupancy(const DeviceProperties& device, uint16_t registers_per_thread) const; 109 | 110 | /** 111 | * Returns the theoretical thread block occupancy for this kernel when running on a specified 112 | * GPU using the same number of registers as the kernel on the measured device. The return 113 | * value is zero if an error occurred. 
114 |    */
115 |   uint32_t threadBlockOccupancy(const DeviceProperties& device) const;
116 | 
117 |   friend bool operator==(const KernelMetadata& lhs, const KernelMetadata& rhs);
118 | 
119 |  private:
120 |   std::string name_;
121 |   int32_t num_blocks_;
122 |   int32_t block_size_;
123 |   int32_t dynamic_shared_memory_;
124 |   int32_t static_shared_memory_;
125 |   uint16_t registers_per_thread_;
126 | };
127 | 
128 | class KernelInstance {
129 |  public:
130 |   KernelInstance(KernelMetadata metadata, uint64_t run_time_ns)
131 |     : metadata_(std::move(metadata)), run_time_ns_(run_time_ns) {}
132 | 
133 |   const KernelMetadata& metadata() const {
134 |     return metadata_;
135 |   }
136 | 
137 |   uint64_t runTimeNs() const {
138 |     return run_time_ns_;
139 |   }
140 | 
141 |   void addMetric(std::string name, double value);
142 | 
143 |   const std::vector<std::pair<std::string, double>>& metrics() const {
144 |     return metrics_;
145 |   }
146 | 
147 |  private:
148 |   KernelMetadata metadata_;
149 |   uint64_t run_time_ns_;
150 |   std::vector<std::pair<std::string, double>> metrics_;
151 | };
152 | 
153 | }
154 | }
155 | 
156 | namespace std {
157 | 
158 | // Allow KernelMetadata to serve as a key in std::unordered_map
159 | template <>
160 | struct hash<habitat::cuda::KernelMetadata> {
161 |   std::size_t operator()(const habitat::cuda::KernelMetadata& m) const {
162 |     return habitat::utils::hash_combine(
163 |       m.name(),
164 |       m.numBlocks(),
165 |       m.blockSize(),
166 |       m.dynamicSharedMemory(),
167 |       m.staticSharedMemory(),
168 |       m.registersPerThread());
169 |   }
170 | };
171 | 
172 | }
173 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/legacy_cupti_profiler.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <functional>
4 | #include <memory>
5 | #include <vector>
6 | 
7 | #include "cupti_profiler.h"
8 | #include "kernel.h"
9 | 
10 | namespace habitat {
11 | namespace cuda {
12 | 
13 | /**
14 |  * Uses the legacy CUPTI metrics APIs (Pascal and older GPUs).
15 |  */
16 | class LegacyCuptiProfiler : public CuptiProfiler {
17 |  public:
18 |   void profile(
19 |       const std::string& metric_name,
20 |       std::function<void()> runnable,
21 |       std::vector<KernelInstance>& kernels) const override;
22 | 
23 |  private:
24 |   LegacyCuptiProfiler();
25 |   ~LegacyCuptiProfiler();
26 |   friend class CuptiProfiler;
27 | 
28 |   class Impl;
29 |   std::unique_ptr<Impl> impl_;
30 | };
31 | 
32 | }
33 | }
34 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/metrics.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <string>
4 | 
5 | namespace habitat {
6 | namespace cuda {
7 | 
8 | class KernelMetric {
9 |  public:
10 |   KernelMetric(std::string kernel_name, double metric_value)
11 |     : kernel_name_(std::move(kernel_name)),
12 |       metric_value_(metric_value) {}
13 | 
14 |   const std::string& kernelName() const {
15 |     return kernel_name_;
16 |   }
17 | 
18 |   const double& metricValue() const {
19 |     return metric_value_;
20 |   }
21 | 
22 |  private:
23 |   std::string kernel_name_;
24 |   double metric_value_;
25 | };
26 | 
27 | }
28 | }
29 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/new_cupti_profiler.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <functional>
4 | #include <memory>
5 | #include <vector>
6 | 
7 | #include "cupti_profiler.h"
8 | #include "kernel.h"
9 | #include "metrics.h"
10 | 
11 | namespace habitat {
12 | namespace cuda {
13 | 
14 | /**
15 |  * Uses the new PerfWorks-based CUPTI profiling APIs (Volta and newer).
16 |  */
17 | class NewCuptiProfiler : public CuptiProfiler {
18 |  public:
19 |   void profile(
20 |       const std::string& metric_name,
21 |       std::function<void()> runnable,
22 |       std::vector<KernelInstance>& kernels) const override;
23 | 
24 |  private:
25 |   NewCuptiProfiler();
26 |   ~NewCuptiProfiler();
27 |   friend class CuptiProfiler;
28 | 
29 |   // We lazily initialize the profiler to prevent CUPTI from potentially
30 |   // introducing overhead when profiling is NOT used.
31 |   void initialize() const;
32 | 
33 |   class State;
34 |   mutable std::unique_ptr<State> state_;
35 | };
36 | 
37 | }
38 | }
39 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/sampled_measurement.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <algorithm>
4 | #include <vector>
5 | 
6 | namespace habitat {
7 | namespace cuda {
8 | 
9 | template <typename T>
10 | struct SampledMeasurement {
11 |   SampledMeasurement(T median, T min, T max) : median(median), min(min), max(max) {
12 |     if (min > max || max < median || min > median) {
13 |       throw std::runtime_error("Invalid values passed into SampledMeasurement.");
14 |     }
15 |   }
16 |   explicit SampledMeasurement(T value) : SampledMeasurement(value, value, value) {}
17 | 
18 |   template <typename V, typename Mapper>
19 |   static SampledMeasurement fromValues(std::vector<V>& values, Mapper mapper) {
20 |     std::vector<T> mapped_values;
21 |     mapped_values.resize(values.size());
22 |     std::transform(values.begin(), values.end(), mapped_values.begin(), mapper);
23 |     return SampledMeasurement::fromValues(mapped_values);
24 |   }
25 | 
26 |   static SampledMeasurement fromValues(std::vector<T>& values) {
27 |     std::sort(values.begin(), values.end());
28 |     size_t mid = values.size() / 2;
29 | 
30 |     if (values.size() % 2 == 0) {
31 |       T mid1 = values.at(mid);
32 |       T mid2 = values.at(mid - 1);
33 |       return SampledMeasurement((mid1 + mid2) / 2, values.front(), values.back());
34 | 
35 |     } else {
36 |       return SampledMeasurement(values.at(mid), values.front(), values.back());
37 |     }
38 |   }
39 | 
40 |   T median;
41 |   T min;
42 |   T max;
43 | };
44 | 
45 | }
46 | }
47 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/utils-inl.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | namespace habitat {
4 | namespace utils {
5 | 
6 | namespace detail {
7 | 
8 | // Combining hashes from: https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x
9 | inline void hash_combine(std::size_t& seed) {}
10 | 
11 | template <typename T, typename... Rest>
12 | inline void hash_combine(std::size_t& seed, const T& v, Rest... rest) {
13 |   std::hash<T> hasher;
14 |   seed ^= hasher(v) + 0x9e3779b9 + (seed<<6) + (seed>>2);
15 |   hash_combine(seed, rest...);
16 | }
17 | 
18 | }
19 | 
20 | template <typename... Values>
21 | inline std::size_t hash_combine(Values... values) {
22 |   std::size_t seed = 0;
23 |   detail::hash_combine(seed, values...);
24 |   return seed;
25 | }
26 | 
27 | }
28 | }
29 | 
-------------------------------------------------------------------------------- /cpp/src/cuda/utils.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | namespace habitat {
4 | namespace utils {
5 | 
6 | /**
7 |  * Hashes and combines several values together.
8 |  *
9 |  * Usage:
10 |  *   std::size_t hash = habitat::utils::hash_combine("hello", "world", 1337);
11 |  *
12 |  */
13 | template <typename... Values>
14 | inline std::size_t hash_combine(Values... values);
15 | 
16 | }
17 | }
18 | 
19 | #include "utils-inl.h"
20 | 
-------------------------------------------------------------------------------- /cpp/src/device_info.cpp: --------------------------------------------------------------------------------
1 | #include <cstring>
2 | #include <iostream>
3 | 
4 | #include <cuda_runtime.h>
5 | #include <gflags/gflags.h>
6 | 
7 | DEFINE_int32(device, 0, "The ID of the device for which information should be extracted.");
8 | 
9 | int main(int argc, char* argv[]) {
10 |   std::string usage("Utility that extracts information about the GPU(s) on this machine.\nUsage: ");
11 |   usage += argv[0];
12 |   gflags::SetUsageMessage(usage);
13 |   gflags::SetVersionString("0.1.0");
14 |   gflags::ParseCommandLineFlags(&argc, &argv, /* remove_flags */ true);
15 | 
16 |   cudaDeviceProp props;
17 |   memset(&props, 0, sizeof(cudaDeviceProp));
18 |   cudaGetDeviceProperties(&props, FLAGS_device);
19 | 
20 |   std::cout << "compute_major: " << props.major << std::endl;
21 |   std::cout << "compute_minor: " << props.minor << std::endl;
22 |   std::cout << "max_threads_per_block: " << props.maxThreadsPerBlock << std::endl;
23 |   std::cout << "max_threads_per_multiprocessor: " << props.maxThreadsPerMultiProcessor << std::endl;
24 |   std::cout << "regs_per_block: " << props.regsPerBlock << std::endl;
25 |   std::cout << "regs_per_multiprocessor: " << props.regsPerMultiprocessor << std::endl;
26 |   std::cout << "warp_size: " << props.warpSize << std::endl;
27 |   std::cout << "shared_mem_per_block: " << props.sharedMemPerBlock << std::endl;
28 |   std::cout << "shared_mem_per_multiprocessor: " << props.sharedMemPerMultiprocessor << std::endl;
29 |   std::cout << "num_sms: " << props.multiProcessorCount << std::endl;
30 |   std::cout << "shared_mem_per_block_optin: " << props.sharedMemPerBlockOptin << std::endl;
31 | 
32 |   return 0;
33 | }
34 | 
-------------------------------------------------------------------------------- /cpp/src/frontend/CMakeLists.txt: --------------------------------------------------------------------------------
1 | target_sources(habitat_cuda
2 |   PRIVATE
3 |     model_bindings.cpp
4 |     profiler.cpp
5 | )
6 | 
-------------------------------------------------------------------------------- /cpp/src/frontend/model_bindings.cpp: --------------------------------------------------------------------------------
1 | #include "model_bindings.h"
2 | 
3 | #include <pybind11/pybind11.h>
4 | #include <pybind11/stl.h>
5 | 
6 | #include "../cuda/kernel.h"
7 | 
8 | namespace py = pybind11;
9 | using habitat::cuda::DeviceProperties;
10 | using habitat::cuda::KernelInstance;
11 | using habitat::cuda::KernelMetadata;
12 | 
13 | namespace habitat {
14 | namespace frontend {
15 | 
16 | void bindModels(pybind11::module& m) {
17 |   py::class_<DeviceProperties>(m, "DeviceProperties")
18 |     .def(
19 |       py::init<std::string, int, int, int, int, int, int, int, size_t, size_t, int, size_t, int, size_t, size_t>(),
20 |       py::arg("name"),
21 |       py::arg("compute_major"),
22 |       py::arg("compute_minor"),
23 |       py::arg("max_threads_per_block"),
24 |       py::arg("max_threads_per_multiprocessor"),
25 |       py::arg("regs_per_block"),
26 |       py::arg("regs_per_multiprocessor"),
27 |       py::arg("warp_size"),
28 |       py::arg("shared_mem_per_block"),
29 |       py::arg("shared_mem_per_multiprocessor"),
30 |       py::arg("num_sms"),
31 |       py::arg("shared_mem_per_block_optin"),
32 |       py::arg("mem_bandwidth_gb"),
33 |       py::arg("base_clock_mhz"),
34 |       py::arg("peak_gflops_per_second"))
35 |     .def("__repr__", [](const DeviceProperties& self) {
36 |       return std::string("DeviceProperties(name=" + self.name + ")");
37 |     }, py::return_value_policy::move)
38 |     .def_property_readonly("name", [](const DeviceProperties& self) {
39 |       return self.name;
40 |     })
41 |     .def_property_readonly("num_sms", [](const DeviceProperties& self) {
42 |       return self.num_sms;
43 |     })
44 |     .def_property_readonly("mem_bandwidth_gb", [](const DeviceProperties& self) {
45 |       return self.mem_bandwidth_gb;
46 |     })
47 |     .def_property_readonly("compute_capability", [](const DeviceProperties& self) {
48 |       return py::make_tuple(self.compute_major, self.compute_minor);
49 |     })
50 |     .def_property_readonly("base_clock_mhz", [](const DeviceProperties& self) {
51 |       return self.base_clock_mhz;
52 |     })
53 |     .def_property_readonly("peak_gflops_per_second", [](const DeviceProperties& self) {
54 |       return self.peak_gflops_per_second;
55 |     });
56 | 
57 |   py::class_<KernelInstance>(m, "KernelInstance")
58 |     .def_property_readonly("name", [](const KernelInstance& self) {
59 |       return self.metadata().name();
60 |     }, py::return_value_policy::reference_internal)
61 |     .def_property_readonly("run_time_ns", &KernelInstance::runTimeNs, py::return_value_policy::reference_internal)
62 |     .def_property_readonly("num_blocks", [](const KernelInstance& self) { return self.metadata().numBlocks(); })
63 |     .def_property_readonly("metrics", &KernelInstance::metrics, py::return_value_policy::reference_internal)
64 |     .def_property_readonly("metadata", [](const KernelInstance& self) {
65 |       py::dict metadata;
66 |       const KernelMetadata& kernel_metadata = self.metadata();
67 |       metadata["name"] = kernel_metadata.name();
68 |       metadata["num_blocks"] = kernel_metadata.numBlocks();
69 |       metadata["block_size"] = kernel_metadata.blockSize();
70 |       metadata["static_shared_memory"] = kernel_metadata.staticSharedMemory();
71 |       metadata["dynamic_shared_memory"] = kernel_metadata.dynamicSharedMemory();
72 |       metadata["registers_per_thread"] = kernel_metadata.registersPerThread();
73 |       return metadata;
74 |     })
75 |     .def("thread_block_occupancy", [](
76 |         const KernelInstance& self, const DeviceProperties& device, int registers_per_thread) {
77 |       if (registers_per_thread < 0) {
78 |         return self.metadata().threadBlockOccupancy(device);
79 |       } else {
80 |         return self.metadata().threadBlockOccupancy(device, registers_per_thread);
81 |       }
82 |     }, py::arg("device"), py::arg("registers_per_thread") = -1);
83 | }
84 | 
85 | }
86 | }
87 | 
-------------------------------------------------------------------------------- /cpp/src/frontend/model_bindings.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <pybind11/pybind11.h>
4 | 
5 | namespace habitat {
6 | namespace frontend {
7 | 
8 | void bindModels(pybind11::module& m);
9 | 
10 | }
11 | }
12 | 
-------------------------------------------------------------------------------- /cpp/src/frontend/profiler.cpp: --------------------------------------------------------------------------------
1 | #include "profiler.h"
2 | 
3 | #include "../cuda/habitat_cupti.h"
4 | 
5 | using habitat::cuda::CuptiManager;
6 | using habitat::cuda::CuptiTracer;
7 | using habitat::cuda::KernelInstance;
8 | 
9 | namespace {
10 | 
11 | std::vector<KernelInstance> measureRunTimes(
12 |     const std::function<void()>& runnable) {
13 |   std::vector<KernelInstance> kernels;
14 |   {
15 |     CuptiTracer::Ptr tracer = CuptiManager::instance().allocateTracer();
16 |     runnable();
17 |     kernels = std::move(*tracer).kernels();
18 |   }
19 |   return kernels;
20 | }
21 | 
22 | }
23 | 
24 | namespace habitat {
25 | namespace frontend {
26 | 
27 | void setCacheMetrics(bool should_cache) {
28 |   CuptiManager::instance().setCacheMetrics(should_cache);
29 | }
30 | 
31 | std::vector<KernelInstance> profile(std::function<void()> runnable) {
32 |   return measureRunTimes(runnable);
33 | }
34 | 
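// The metric-collecting overload below builds on the run-time-only profile()
// above: it first replays the runnable under a CUPTI tracer to record
// per-kernel run times (measureRunTimes), then hands the same runnable along
// with the recorded kernels to CuptiManager::measureMetric so that the
// requested CUPTI metric can be attached to each KernelInstance (typically by
// executing the workload again).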
35 | std::vector<KernelInstance> profile(
36 |     std::function<void()> runnable, const std::string& metric) {
37 |   std::vector<KernelInstance> kernels(measureRunTimes(runnable));
38 |   CuptiManager::instance().measureMetric(metric, std::move(runnable), kernels);
39 |   return kernels;
40 | }
41 | 
42 | }
43 | }
44 | 
-------------------------------------------------------------------------------- /cpp/src/frontend/profiler.h: --------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include <functional>
4 | #include <string>
5 | #include <vector>
6 | 
7 | #include "../cuda/kernel.h"
8 | 
9 | namespace habitat {
10 | namespace frontend {
11 | 
12 | void setCacheMetrics(bool should_cache);
13 | 
14 | std::vector<cuda::KernelInstance> profile(std::function<void()> runnable);
15 | 
16 | std::vector<cuda::KernelInstance> profile(
17 |     std::function<void()> runnable, const std::string& metric);
18 | 
19 | }
20 | }
21 | 
-------------------------------------------------------------------------------- /cpp/src/habitat_cuda.cpp: --------------------------------------------------------------------------------
1 | #include <functional>
2 | #include <string>
3 | 
4 | #include <pybind11/pybind11.h>
5 | 
6 | #include "cuda/diagnostics.h"
7 | #include "frontend/model_bindings.h"
8 | #include "frontend/profiler.h"
9 | 
10 | namespace py = pybind11;
11 | 
12 | PYBIND11_MODULE(habitat_cuda, m) {
13 |   habitat::frontend::bindModels(m);
14 | 
15 |   m.def("profile", [](py::function runnable_python, const std::string& metric) {
16 |     std::function<void()> runnable = [runnable_python]() {
17 |       runnable_python();
18 |     };
19 |     if (metric.size() == 0) {
20 |       return habitat::frontend::profile(std::move(runnable));
21 |     } else {
22 |       return habitat::frontend::profile(std::move(runnable), metric);
23 |     }
24 |   }, py::arg("runnable"), py::arg("metric") = "", py::return_value_policy::move);
25 | 
26 |   m.def("set_cache_metrics", [](bool should_cache) {
27 |     habitat::frontend::setCacheMetrics(should_cache);
28 |   }, py::arg("should_cache"));
29 | 
30 |   m.def_submodule("_diagnostics")
31 |     .def("run_flop_test", [](size_t num_blocks, size_t threads_per_block) {
32 |       habitat::cuda::diagnostics::run_flop_test(num_blocks, threads_per_block);
33 |     }, py::arg("num_blocks") = 8192, py::arg("threads_per_block") = 256);
34 | }
35 | 
-------------------------------------------------------------------------------- /docker/Dockerfile: --------------------------------------------------------------------------------
1 | FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
2 | MAINTAINER Geoffrey Yu
3 | 
4 | RUN apt-get update --fix-missing && \
5 |     apt-get install --no-install-recommends -y software-properties-common && \
6 |     apt-get update && \
7 |     apt-get install --no-install-recommends -y sudo && \
8 |     apt-get install --no-install-recommends -y python3-pip python3-setuptools python3-dev && \
9 |     apt-get install --no-install-recommends -y wget bzip2 ca-certificates libssl-dev && \
10 |     rm -rf /var/lib/apt/lists/*
11 | 
12 | RUN pip3 install wheel && pip3 install numpy PyYAML
13 | RUN pip3 install \
14 |     torch==1.4.0 \
15 |     pillow==7.2.0 \
16 |     torchvision==0.5.0 \
17 |     pandas==1.1.2 \
18 |     tqdm==4.49.0
19 | 
20 | # Download cmake
21 | RUN wget "https://github.com/Kitware/CMake/releases/download/v3.17.0-rc1/cmake-3.17.0-rc1.tar.gz" -O /opt/cmake-3.17.0-rc1.tar.gz && \
22 |     cd /opt && tar xzf cmake-3.17.0-rc1.tar.gz
23 | 
24 | # Install cmake
25 | RUN cd /opt/cmake-3.17.0-rc1 && \
26 |     ./bootstrap && \
27 |     make -j 16 && \
28 |     make install
29 | 
30 | # NOTE: gosu is used in create-user.sh
31 | RUN mkdir ~/.gnupg && echo "disable-ipv6" >> ~/.gnupg/dirmngr.conf
32 | RUN gpg --keyserver keyserver.ubuntu.com
--recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 33 | RUN wget "https://github.com/tianon/gosu/releases/download/1.11/gosu-$(dpkg --print-architecture | awk -F- '{ print $NF }')" -O /usr/local/bin/gosu && \ 34 | wget "https://github.com/tianon/gosu/releases/download/1.11/gosu-$(dpkg --print-architecture | awk -F- '{ print $NF }').asc" -O /usr/local/bin/gosu.asc && \ 35 | gpg --verify /usr/local/bin/gosu.asc && \ 36 | rm /usr/local/bin/gosu.asc && \ 37 | chmod +x /usr/local/bin/gosu 38 | 39 | COPY create-user.sh /usr/local/bin/create-user.sh 40 | RUN chmod +x /usr/local/bin/create-user.sh 41 | ENTRYPOINT ["/usr/local/bin/create-user.sh"] 42 | CMD ["/bin/bash"] 43 | -------------------------------------------------------------------------------- /docker/README.md: -------------------------------------------------------------------------------- 1 | Habitat Docker Image 2 | ===================== 3 | The Dockerfile in this directory specifies a Docker image that is used as our 4 | development and test environment. Run `setup.sh` to build the Docker image. 5 | 6 | To start a container, run the `start.sh` script. The container is set up so 7 | that your current account is duplicated inside the container (with the same 8 | user ID and username). This prevents permission issues when accessing files in 9 | mounted volumes inside and outside the container. The user inside the container 10 | will have `sudo` permissions; the account's password will be set to your 11 | username. 12 | 13 | Note that the `start.sh` script will restart any containers that are stopped 14 | but have not been removed. If you make any changes to the Docker image and/or 15 | want to start a new container, you need to remove the existing container with 16 | `docker rm` before running `start.sh` again. 17 | -------------------------------------------------------------------------------- /docker/create-user.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script is executed each time the container is started to create a user 4 | 5 | if [ -z $CONTAINER_UID ] || [ -z $CONTAINER_UNAME ]; then 6 | echo "Please set the \"CONTAINER_UID\" and \"CONTAINER_UNAME\" environment variables." 7 | exit 1 8 | fi 9 | 10 | # Create the user if they do not exist 11 | if ! id -u ${CONTAINER_UNAME} &> /dev/null; then 12 | # NOTE: The home directory is automatically created because we mount stuff inside it 13 | useradd --shell /bin/bash -u ${CONTAINER_UID} ${CONTAINER_UNAME} && \ 14 | adduser ${CONTAINER_UNAME} sudo && \ 15 | echo "${CONTAINER_UNAME}:${CONTAINER_UNAME}" | chpasswd 16 | 17 | export HOME=/home/${CONTAINER_UNAME} 18 | echo "cd /home/${CONTAINER_UNAME}" >> /home/${CONTAINER_UNAME}/.bashrc 19 | echo "alias ls=\"ls --color\"" >> /home/${CONTAINER_UNAME}/.bashrc 20 | chown ${CONTAINER_UNAME}:${CONTAINER_UNAME} /home/${CONTAINER_UNAME} 21 | chown ${CONTAINER_UNAME}:${CONTAINER_UNAME} /home/${CONTAINER_UNAME}/.bashrc 22 | fi 23 | 24 | exec /usr/local/bin/gosu ${CONTAINER_UNAME} "$@" 25 | -------------------------------------------------------------------------------- /docker/setup.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | SCRIPT_LOCATION=$(cd $(dirname $0) && pwd -P) 3 | source $SCRIPT_LOCATION/vars.sh 4 | 5 | echo "This script will build a habitat container image." 6 | echo "" 7 | read -p "Do you want to continue? (y/n) " -r 8 | if [[ ! 
$REPLY =~ ^[Yy]$ ]] 9 | then 10 | exit 1 11 | fi 12 | 13 | docker build -t $IMAGE_NAME . 14 | -------------------------------------------------------------------------------- /docker/start.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # This script drops you into a habitat container. The container image must 4 | # exist. If it does not exist, build it first using the setup.sh script. 5 | # 6 | # If no containers exist (stopped or running), this script will run a new container. 7 | # You can optionally pass in another directory to mount as an argument to this script. 8 | # If no arguments are provided to the script, your home directory will be mounted 9 | # inside the container at ~/home. 10 | # 11 | # If a running container exists, this script will just start a new bash shell inside 12 | # the container. 13 | # 14 | # If a stopped container exists, this script will restart it. 15 | 16 | SCRIPT_LOCATION=$(cd $(dirname $0) && pwd -P) 17 | source $SCRIPT_LOCATION/vars.sh 18 | 19 | UNAME=$(id -un) 20 | CONTAINER_NAME=$IMAGE_NAME-$UNAME 21 | 22 | if [ -z $1 ]; then 23 | MOUNT_VOL=$(cd ~ && pwd):/home/$UNAME/home 24 | else 25 | ABS_MOUNT_DIR=$(cd $1 && pwd) 26 | MOUNT_DIR=$(basename $ABS_MOUNT_DIR) 27 | MOUNT_VOL=$ABS_MOUNT_DIR:/home/$UNAME/$MOUNT_DIR 28 | fi 29 | 30 | RUNNING=$(docker ps --filter "status=running" --filter "name=$CONTAINER_NAME" --format "{{.ID}}") 31 | EXITED=$(docker ps --filter "status=exited" --filter "name=$CONTAINER_NAME" --format "{{.ID}}") 32 | 33 | if [ -z "$RUNNING" ] && [ -z "$EXITED" ]; then 34 | # Container was never started 35 | 36 | # NOTE: For stable all reduce measurements, it's important to ensure that the 37 | # shared memory limits are increased using 38 | # 39 | # --shm-size=1g --ulimit memlock=-1 40 | # 41 | docker run -ti \ 42 | -e "CONTAINER_UID=$(id -u)" \ 43 | -e "CONTAINER_UNAME=$(id -un)" \ 44 | --name $CONTAINER_NAME \ 45 | --volume $MOUNT_VOL \ 46 | --runtime=nvidia \ 47 | --workdir=/home/$UNAME \ 48 | --shm-size=1g \ 49 | --ulimit memlock=-1 \ 50 | $IMAGE_NAME 51 | elif [ -z "$RUNNING" ]; then 52 | # Container exited but was not removed. We can restart it. 53 | docker start -ai $EXITED 54 | else 55 | # Already running, so just attach 56 | docker exec -it $CONTAINER_NAME \ 57 | /usr/local/bin/gosu \ 58 | $UNAME \ 59 | /bin/bash 60 | fi 61 | -------------------------------------------------------------------------------- /docker/vars.sh: -------------------------------------------------------------------------------- 1 | VARS_FILE_LOCATION=$(cd $(dirname $0) && pwd -P) 2 | IMAGE_NAME=habitat-cuda10.1-cudnn7 3 | -------------------------------------------------------------------------------- /experiments/.gitignore: -------------------------------------------------------------------------------- 1 | results 2 | -------------------------------------------------------------------------------- /experiments/dcgan/LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2017, 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 
11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /experiments/dcgan/README.md: -------------------------------------------------------------------------------- 1 | # DCGAN 2 | 3 | The code in this directory is adapted from the PyTorch DCGAN example that can 4 | be found [here](https://github.com/pytorch/examples/tree/master/dcgan). As a 5 | result, it is also licensed under the BSD-3 license (see `LICENSE`). 6 | 7 | The original README can be found below. 8 | 9 | ------------------------------------------- 10 | 11 | # Deep Convolution Generative Adversarial Networks 12 | 13 | This example implements the paper [Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks](http://arxiv.org/abs/1511.06434) 14 | 15 | The implementation is very close to the Torch implementation [dcgan.torch](https://github.com/soumith/dcgan.torch) 16 | 17 | After every 100 training iterations, the files `real_samples.png` and `fake_samples.png` are written to disk 18 | with the samples from the generative model. 19 | 20 | After every epoch, models are saved to: `netG_epoch_%d.pth` and `netD_epoch_%d.pth` 21 | 22 | ## Downloading the dataset 23 | You can download the LSUN dataset by cloning [this repo](https://github.com/fyu/lsun) and running 24 | ``` 25 | python download.py -c bedroom 26 | ``` 27 | 28 | ## Usage 29 | ``` 30 | usage: main.py [-h] --dataset DATASET --dataroot DATAROOT [--workers WORKERS] 31 | [--batchSize BATCHSIZE] [--imageSize IMAGESIZE] [--nz NZ] 32 | [--ngf NGF] [--ndf NDF] [--niter NITER] [--lr LR] 33 | [--beta1 BETA1] [--cuda] [--ngpu NGPU] [--netG NETG] 34 | [--netD NETD] 35 | 36 | optional arguments: 37 | -h, --help show this help message and exit 38 | --dataset DATASET cifar10 | lsun | mnist |imagenet | folder | lfw | fake 39 | --dataroot DATAROOT path to dataset 40 | --workers WORKERS number of data loading workers 41 | --batchSize BATCHSIZE input batch size 42 | --imageSize IMAGESIZE the height / width of the input image to network 43 | --nz NZ size of the latent z vector 44 | --ngf NGF 45 | --ndf NDF 46 | --niter NITER number of epochs to train for 47 | --lr LR learning rate, default=0.0002 48 | --beta1 BETA1 beta1 for adam. 
default=0.5 49 | --cuda enables cuda 50 | --ngpu NGPU number of GPUs to use 51 | --netG NETG path to netG (to continue training) 52 | --netD NETD path to netD (to continue training) 53 | --outf OUTF folder to output images and model checkpoints 54 | --manualSeed SEED manual seed 55 | --classes CLASSES comma separated list of classes for the lsun data set 56 | ``` 57 | -------------------------------------------------------------------------------- /experiments/dcgan/dcgan.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import random 4 | import torch 5 | import torch.nn as nn 6 | import torch.utils.data 7 | 8 | 9 | def model_config(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--dataroot', required=False, help='path to dataset') 12 | parser.add_argument('--workers', type=int, help='number of data loading workers', default=2) 13 | parser.add_argument('--batchSize', type=int, default=64, help='input batch size') 14 | parser.add_argument('--imageSize', type=int, default=64, help='the height / width of the input image to network') 15 | parser.add_argument('--nz', type=int, default=100, help='size of the latent z vector') 16 | parser.add_argument('--ngf', type=int, default=64) 17 | parser.add_argument('--ndf', type=int, default=64) 18 | parser.add_argument('--niter', type=int, default=25, help='number of epochs to train for') 19 | parser.add_argument('--lr', type=float, default=0.0002, help='learning rate, default=0.0002') 20 | parser.add_argument('--beta1', type=float, default=0.5, help='beta1 for adam. default=0.5') 21 | parser.add_argument('--cuda', action='store_true', help='enables cuda') 22 | parser.add_argument('--dry-run', action='store_true', help='check a single training cycle works') 23 | parser.add_argument('--ngpu', type=int, default=1, help='number of GPUs to use') 24 | parser.add_argument('--netG', default='', help="path to netG (to continue training)") 25 | parser.add_argument('--netD', default='', help="path to netD (to continue training)") 26 | parser.add_argument('--outf', default='.', help='folder to output images and model checkpoints') 27 | parser.add_argument('--manualSeed', type=int, help='manual seed') 28 | parser.add_argument('--classes', default='bedroom', help='comma separated list of classes for the lsun data set') 29 | 30 | opt = parser.parse_args(args=[]) 31 | return opt 32 | 33 | 34 | device = torch.device("cuda") 35 | nz = 100 36 | ngf = 64 37 | ndf = 64 38 | nc = 3 39 | 40 | 41 | # custom weights initialization called on netG and netD 42 | def weights_init(m): 43 | classname = m.__class__.__name__ 44 | if classname.find('Conv') != -1: 45 | torch.nn.init.normal_(m.weight, 0.0, 0.02) 46 | elif classname.find('BatchNorm') != -1: 47 | torch.nn.init.normal_(m.weight, 1.0, 0.02) 48 | torch.nn.init.zeros_(m.bias) 49 | 50 | 51 | class Generator(nn.Module): 52 | def __init__(self, ngpu): 53 | super(Generator, self).__init__() 54 | self.ngpu = ngpu 55 | self.main = nn.Sequential( 56 | # input is Z, going into a convolution 57 | nn.ConvTranspose2d( nz, ngf * 8, 4, 1, 0, bias=False), 58 | nn.BatchNorm2d(ngf * 8), 59 | nn.ReLU(True), 60 | # state size. (ngf*8) x 4 x 4 61 | nn.ConvTranspose2d(ngf * 8, ngf * 4, 4, 2, 1, bias=False), 62 | nn.BatchNorm2d(ngf * 4), 63 | nn.ReLU(True), 64 | # state size. (ngf*4) x 8 x 8 65 | nn.ConvTranspose2d(ngf * 4, ngf * 2, 4, 2, 1, bias=False), 66 | nn.BatchNorm2d(ngf * 2), 67 | nn.ReLU(True), 68 | # state size. 
(ngf*2) x 16 x 16 69 | nn.ConvTranspose2d(ngf * 2, ngf, 4, 2, 1, bias=False), 70 | nn.BatchNorm2d(ngf), 71 | nn.ReLU(True), 72 | # state size. (ngf) x 32 x 32 73 | nn.ConvTranspose2d( ngf, nc, 4, 2, 1, bias=False), 74 | nn.Tanh() 75 | # state size. (nc) x 64 x 64 76 | ) 77 | 78 | def forward(self, input): 79 | if input.is_cuda and self.ngpu > 1: 80 | output = nn.parallel.data_parallel(self.main, input, range(self.ngpu)) 81 | else: 82 | output = self.main(input) 83 | return output 84 | 85 | 86 | class Discriminator(nn.Module): 87 | def __init__(self, ngpu): 88 | super(Discriminator, self).__init__() 89 | self.ngpu = ngpu 90 | self.main = nn.Sequential( 91 | # input is (nc) x 64 x 64 92 | nn.Conv2d(nc, ndf, 4, 2, 1, bias=False), 93 | nn.LeakyReLU(0.2, inplace=True), 94 | # state size. (ndf) x 32 x 32 95 | nn.Conv2d(ndf, ndf * 2, 4, 2, 1, bias=False), 96 | nn.BatchNorm2d(ndf * 2), 97 | nn.LeakyReLU(0.2, inplace=True), 98 | # state size. (ndf*2) x 16 x 16 99 | nn.Conv2d(ndf * 2, ndf * 4, 4, 2, 1, bias=False), 100 | nn.BatchNorm2d(ndf * 4), 101 | nn.LeakyReLU(0.2, inplace=True), 102 | # state size. (ndf*4) x 8 x 8 103 | nn.Conv2d(ndf * 4, ndf * 8, 4, 2, 1, bias=False), 104 | nn.BatchNorm2d(ndf * 8), 105 | nn.LeakyReLU(0.2, inplace=True), 106 | # state size. (ndf*8) x 4 x 4 107 | nn.Conv2d(ndf * 8, 1, 4, 1, 0, bias=False), 108 | nn.Sigmoid() 109 | ) 110 | 111 | def forward(self, input): 112 | if input.is_cuda and self.ngpu > 1: 113 | output = nn.parallel.data_parallel(self.main, input, range(self.ngpu)) 114 | else: 115 | output = self.main(input) 116 | 117 | return output.view(-1, 1).squeeze(1) 118 | -------------------------------------------------------------------------------- /experiments/dcgan/entry_point.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from . 
import dcgan 5 | 6 | 7 | def skyline_model_provider(numgpu=1): 8 | netG = dcgan.Generator(numgpu).cuda() 9 | netG.apply(dcgan.weights_init) 10 | netD = dcgan.Discriminator(numgpu).cuda() 11 | netD.apply(dcgan.weights_init) 12 | return netG, netD 13 | 14 | 15 | def skyline_input_provider(batch_size=64): 16 | return ( 17 | batch_size, 18 | torch.randn((batch_size, 3, 64, 64)).cuda(), 19 | ) 20 | 21 | 22 | def skyline_iteration_provider(netG, netD): 23 | real_label = 1 24 | fake_label = 0 25 | opt = dcgan.model_config() 26 | 27 | optimizerD = torch.optim.Adam(netD.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) 28 | optimizerG = torch.optim.Adam(netG.parameters(), lr=opt.lr, betas=(opt.beta1, 0.999)) 29 | 30 | criterion = nn.BCELoss() 31 | 32 | device = torch.device("cuda") 33 | 34 | def iteration(*inputs): 35 | #for i, data in enumerate(dataloader, 0): 36 | ############################ 37 | # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) 38 | ########################### 39 | batch_size, data = inputs 40 | # train with real 41 | netD.zero_grad() 42 | real_cpu = data.to(device) 43 | label = torch.full((batch_size,), real_label, 44 | dtype=real_cpu.dtype, device=device) 45 | output = netD(real_cpu) 46 | errD_real = criterion(output, label) 47 | errD_real.backward() 48 | 49 | # train with fake 50 | noise = torch.randn(batch_size, dcgan.nz, 1, 1, device=device) 51 | fake = netG(noise) 52 | label.fill_(fake_label) 53 | output = netD(fake.detach()) 54 | errD_fake = criterion(output, label) 55 | errD_fake.backward() 56 | optimizerD.step() 57 | 58 | ############################ 59 | # (2) Update G network: maximize log(D(G(z))) 60 | ########################### 61 | netG.zero_grad() 62 | label.fill_(real_label) # fake labels are real for generator cost 63 | output = netD(fake) 64 | errG = criterion(output, label) 65 | errG.backward() 66 | optimizerG.step() 67 | return iteration 68 | -------------------------------------------------------------------------------- /experiments/gather_raw_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | # Operate out of the script directory 6 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P) 7 | cd $SCRIPT_PATH 8 | 9 | if [ -z "$1" ]; then 10 | echo "Usage: $0 " 11 | exit 1 12 | fi 13 | 14 | python3 run_experiment.py $1 15 | tar cvzf hv2-$1.tar.gz *.csv 16 | rm *.csv 17 | -------------------------------------------------------------------------------- /experiments/gnmt/README.md: -------------------------------------------------------------------------------- 1 | # GNMT (Google Neural Machine Translation) Model 2 | 3 | This directory contains an implementation of GNMT that was adapted from the 4 | code found in the [MLPerf training 5 | repository](https://github.com/mlperf/training/tree/master/rnn_translator). 6 | 7 | ## License 8 | 9 | This code, with the exception of the `skyline_` prefixed functions in 10 | `entry_point.py`, was adapted from the MLPerf training benchmarks and therefore 11 | shares the same license. The unmodified license can be found in the `LICENSE` 12 | file in the `seq2seq` directory. 
13 | 
-------------------------------------------------------------------------------- /experiments/gnmt/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/gnmt/__init__.py
-------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Elad Hoffer
4 | Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
5 | 
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 | 
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | 
-------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/data/config.py: --------------------------------------------------------------------------------
1 | PAD_TOKEN = '<pad>'
2 | UNK_TOKEN = '<unk>'
3 | BOS_TOKEN = '<s>'
4 | EOS_TOKEN = '<\s>'
5 | 
6 | # special PAD, UNKNOWN, BEGIN-OF-STRING, END-OF-STRING tokens
7 | PAD, UNK, BOS, EOS = [0, 1, 2, 3]
8 | 
9 | # path to the BPE vocabulary file, relative to the data directory, it should
10 | # point to file generated by subword-nmt/get_vocab.py
11 | VOCAB_FNAME = 'vocab.bpe.32000'
12 | 
13 | # paths to source and target training files, relative to the data directory, it
14 | # should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
15 | SRC_TRAIN_FNAME = 'train.tok.clean.bpe.32000.en'
16 | TGT_TRAIN_FNAME = 'train.tok.clean.bpe.32000.de'
17 | 
18 | # paths to source and target validation files, relative to the data directory,
19 | # it should point to BPE-encoded files, generated by subword-nmt/apply_bpe.py
20 | SRC_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.en'
21 | TGT_VAL_FNAME = 'newstest_dev.tok.clean.bpe.32000.de'
22 | 
23 | # path to the test source file, relative to the data directory, it should point
24 | # to BPE-encoded file, generated by subword-nmt/apply_bpe.py
25 | SRC_TEST_FNAME = 'newstest2014.tok.bpe.32000.en'
26 | 
27 | # path to the test target file, relative to the data directory, it should point
28 | # to plaintext file, tokenization is performed by the sacrebleu package
29 | TGT_TEST_TARGET_FNAME = 'newstest2014.de'
30 | 
31 | # path to the moses detokenizer, relative to the data directory
32 | DETOKENIZER = 'mosesdecoder/scripts/tokenizer/detokenizer.perl'
33 | 
--------------------------------------------------------------------------------
/experiments/gnmt/seq2seq/data/tokenizer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import defaultdict 3 | from functools import partial 4 | 5 | import gnmt.seq2seq.data.config as config 6 | 7 | 8 | class Tokenizer: 9 | """ 10 | Tokenizer class. 11 | """ 12 | def __init__(self, vocab_fname=None, pad=1, separator='@@'): 13 | """ 14 | Constructor for the Tokenizer class. 15 | 16 | :param vocab_fname: path to the file with vocabulary 17 | :param pad: pads vocabulary to a multiple of 'pad' tokens 18 | :param separator: tokenization separator 19 | """ 20 | if vocab_fname: 21 | self.separator = separator 22 | 23 | logging.info(f'Building vocabulary from {vocab_fname}') 24 | vocab = [config.PAD_TOKEN, config.UNK_TOKEN, 25 | config.BOS_TOKEN, config.EOS_TOKEN] 26 | 27 | with open(vocab_fname) as vfile: 28 | for line in vfile: 29 | vocab.append(line.strip()) 30 | 31 | self.pad_vocabulary(vocab, pad) 32 | 33 | self.vocab_size = len(vocab) 34 | logging.info(f'Size of vocabulary: {self.vocab_size}') 35 | 36 | self.tok2idx = defaultdict(partial(int, config.UNK)) 37 | for idx, token in enumerate(vocab): 38 | self.tok2idx[token] = idx 39 | 40 | self.idx2tok = {} 41 | for key, value in self.tok2idx.items(): 42 | self.idx2tok[value] = key 43 | 44 | def pad_vocabulary(self, vocab, pad): 45 | """ 46 | Pads vocabulary to a multiple of 'pad' tokens. 47 | 48 | :param vocab: list with vocabulary 49 | :param pad: integer 50 | """ 51 | vocab_size = len(vocab) 52 | padded_vocab_size = (vocab_size + pad - 1) // pad * pad 53 | for i in range(0, padded_vocab_size - vocab_size): 54 | token = f'madeupword{i:04d}' 55 | vocab.append(token) 56 | assert len(vocab) % pad == 0 57 | 58 | def get_state(self): 59 | logging.info(f'Saving state of the tokenizer') 60 | state = { 61 | 'separator': self.separator, 62 | 'vocab_size': self.vocab_size, 63 | 'tok2idx': self.tok2idx, 64 | 'idx2tok': self.idx2tok, 65 | } 66 | return state 67 | 68 | def set_state(self, state): 69 | logging.info(f'Restoring state of the tokenizer') 70 | self.separator = state['separator'] 71 | self.vocab_size = state['vocab_size'] 72 | self.tok2idx = state['tok2idx'] 73 | self.idx2tok = state['idx2tok'] 74 | 75 | def segment(self, line): 76 | """ 77 | Tokenizes single sentence and adds special BOS and EOS tokens. 78 | 79 | :param line: sentence 80 | 81 | returns: list representing tokenized sentence 82 | """ 83 | line = line.strip().split() 84 | entry = [self.tok2idx[i] for i in line] 85 | entry = [config.BOS] + entry + [config.EOS] 86 | return entry 87 | 88 | def detokenize(self, inputs, delim=' '): 89 | """ 90 | Detokenizes single sentence and removes token separator characters. 
91 | 92 | :param inputs: sequence of tokens 93 | :param delim: tokenization delimiter 94 | 95 | returns: string representing detokenized sentence 96 | """ 97 | detok = delim.join([self.idx2tok[idx] for idx in inputs]) 98 | detok = detok.replace(self.separator + ' ', '') 99 | detok = detok.replace(self.separator, '') 100 | 101 | detok = detok.replace(config.BOS_TOKEN, '') 102 | detok = detok.replace(config.EOS_TOKEN, '') 103 | detok = detok.replace(config.PAD_TOKEN, '') 104 | detok = detok.strip() 105 | return detok 106 | -------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/models/encoder.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.utils.rnn import pack_padded_sequence 3 | from torch.nn.utils.rnn import pad_packed_sequence 4 | 5 | import gnmt.seq2seq.data.config as config 6 | from gnmt.seq2seq.utils import init_lstm_ 7 | 8 | 9 | class ResidualRecurrentEncoder(nn.Module): 10 | """ 11 | Encoder with Embedding, LSTM layers, residual connections and optional 12 | dropout. 13 | 14 | The first LSTM layer is bidirectional and uses variable sequence length 15 | API, the remaining (num_layers-1) layers are unidirectional. Residual 16 | connections are enabled after third LSTM layer, dropout is applied on 17 | inputs to LSTM layers. 18 | """ 19 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 20 | batch_first=False, embedder=None, init_weight=0.1): 21 | """ 22 | Constructor for the ResidualRecurrentEncoder. 23 | 24 | :param vocab_size: size of vocabulary 25 | :param hidden_size: hidden size for LSTM layers 26 | :param num_layers: number of LSTM layers, 1st layer is bidirectional 27 | :param dropout: probability of dropout (on input to LSTM layers) 28 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 29 | if false the model uses (seq, batch, feature) 30 | :param embedder: instance of nn.Embedding, if None constructor will 31 | create new embedding layer 32 | :param init_weight: range for the uniform initializer 33 | """ 34 | super(ResidualRecurrentEncoder, self).__init__() 35 | self.batch_first = batch_first 36 | self.rnn_layers = nn.ModuleList() 37 | # 1st LSTM layer, bidirectional 38 | self.rnn_layers.append( 39 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 40 | batch_first=batch_first, bidirectional=True)) 41 | 42 | # 2nd LSTM layer, with 2x larger input_size 43 | self.rnn_layers.append( 44 | nn.LSTM((2 * hidden_size), hidden_size, num_layers=1, bias=True, 45 | batch_first=batch_first)) 46 | 47 | # Remaining LSTM layers 48 | for _ in range(num_layers - 2): 49 | self.rnn_layers.append( 50 | nn.LSTM(hidden_size, hidden_size, num_layers=1, bias=True, 51 | batch_first=batch_first)) 52 | 53 | for lstm in self.rnn_layers: 54 | init_lstm_(lstm, init_weight) 55 | 56 | self.dropout = nn.Dropout(p=dropout) 57 | 58 | if embedder is not None: 59 | self.embedder = embedder 60 | else: 61 | self.embedder = nn.Embedding(vocab_size, hidden_size, 62 | padding_idx=config.PAD) 63 | nn.init.uniform_(self.embedder.weight.data, -init_weight, init_weight) 64 | 65 | def forward(self, inputs, lengths): 66 | """ 67 | Execute the encoder. 
68 | 69 | :param inputs: tensor with indices from the vocabulary 70 | :param lengths: vector with sequence lengths (excluding padding) 71 | 72 | returns: tensor with encoded sequences 73 | """ 74 | x = self.embedder(inputs) 75 | 76 | # bidirectional layer 77 | x = self.dropout(x) 78 | x = pack_padded_sequence(x, lengths.cpu().numpy(), 79 | batch_first=self.batch_first) 80 | x, _ = self.rnn_layers[0](x) 81 | x, _ = pad_packed_sequence(x, batch_first=self.batch_first) 82 | 83 | # 1st unidirectional layer 84 | x = self.dropout(x) 85 | x, _ = self.rnn_layers[1](x) 86 | 87 | # the rest of unidirectional layers, 88 | # with residual connections starting from 3rd layer 89 | for i in range(2, len(self.rnn_layers)): 90 | residual = x 91 | x = self.dropout(x) 92 | x, _ = self.rnn_layers[i](x) 93 | x = x + residual 94 | 95 | return x 96 | -------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/models/gnmt.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | import gnmt.seq2seq.data.config as config 4 | from gnmt.seq2seq.models.decoder import ResidualRecurrentDecoder 5 | from gnmt.seq2seq.models.encoder import ResidualRecurrentEncoder 6 | from gnmt.seq2seq.models.seq2seq_base import Seq2Seq 7 | from gnmt.seq2seq.utils import gnmt_print 8 | 9 | 10 | class GNMT(Seq2Seq): 11 | """ 12 | GNMT v2 model 13 | """ 14 | def __init__(self, vocab_size, hidden_size=1024, num_layers=4, dropout=0.2, 15 | batch_first=False, share_embedding=True): 16 | """ 17 | Constructor for the GNMT v2 model. 18 | 19 | :param vocab_size: size of vocabulary (number of tokens) 20 | :param hidden_size: internal hidden size of the model 21 | :param num_layers: number of layers, applies to both encoder and 22 | decoder 23 | :param dropout: probability of dropout (in encoder and decoder) 24 | :param batch_first: if True the model uses (batch,seq,feature) tensors, 25 | if false the model uses (seq, batch, feature) 26 | :param share_embedding: if True embeddings are shared between encoder 27 | and decoder 28 | """ 29 | 30 | super(GNMT, self).__init__(batch_first=batch_first) 31 | 32 | if share_embedding: 33 | embedder = nn.Embedding(vocab_size, hidden_size, 34 | padding_idx=config.PAD) 35 | nn.init.uniform_(embedder.weight.data, -0.1, 0.1) 36 | else: 37 | embedder = None 38 | 39 | self.encoder = ResidualRecurrentEncoder(vocab_size, hidden_size, 40 | num_layers, dropout, 41 | batch_first, embedder) 42 | 43 | self.decoder = ResidualRecurrentDecoder(vocab_size, hidden_size, 44 | num_layers, dropout, 45 | batch_first, embedder) 46 | 47 | def forward(self, input_encoder, input_enc_len, input_decoder): 48 | context = self.encode(input_encoder, input_enc_len) 49 | context = (context, input_enc_len, None) 50 | output, _, _ = self.decode(input_decoder, context) 51 | 52 | return output 53 | -------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/models/seq2seq_base.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.nn.functional import log_softmax 3 | 4 | 5 | class Seq2Seq(nn.Module): 6 | """ 7 | Generic Seq2Seq module, with an encoder and a decoder. 8 | """ 9 | def __init__(self, encoder=None, decoder=None, batch_first=False): 10 | """ 11 | Constructor for the Seq2Seq module. 
12 | 
13 |         :param encoder: encoder module
14 |         :param decoder: decoder module
15 |         :param batch_first: if True the model uses (batch, seq, feature)
16 |             tensors, if false the model uses (seq, batch, feature) tensors
17 |         """
18 |         super(Seq2Seq, self).__init__()
19 |         self.encoder = encoder
20 |         self.decoder = decoder
21 |         self.batch_first = batch_first
22 | 
23 |     def encode(self, inputs, lengths):
24 |         """
25 |         Applies the encoder to inputs with a given input sequence lengths.
26 | 
27 |         :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
28 |             else (seq_len, batch)
29 |         :param lengths: vector with sequence lengths (excluding padding)
30 |         """
31 |         return self.encoder(inputs, lengths)
32 | 
33 |     def decode(self, inputs, context, inference=False):
34 |         """
35 |         Applies the decoder to inputs, given the context from the encoder.
36 | 
37 |         :param inputs: tensor with inputs (batch, seq_len) if 'batch_first'
38 |             else (seq_len, batch)
39 |         :param context: context from the encoder
40 |         :param inference: if True inference mode, if False training mode
41 |         """
42 |         return self.decoder(inputs, context, inference)
43 | 
44 |     def generate(self, inputs, context, beam_size):
45 |         """
46 |         Autoregressive generator, works with SequenceGenerator class.
47 |         Executes decoder (in inference mode), applies log_softmax and topK for
48 |         inference with beam search decoding.
49 | 
50 |         :param inputs: tensor with inputs to the decoder
51 |         :param context: context from the encoder
52 |         :param beam_size: beam size for the generator
53 | 
54 |         returns: (words, logprobs, scores, new_context)
55 |             words: indices of topK tokens
56 |             logprobs: log probabilities of topK tokens
57 |             scores: scores from the attention module (for coverage penalty)
58 |             new_context: new decoder context, includes new hidden states for
59 |                 decoder RNN cells
60 |         """
61 |         logits, scores, new_context = self.decode(inputs, context, True)
62 |         logprobs = log_softmax(logits, dim=-1)
63 |         logprobs, words = logprobs.topk(beam_size, dim=-1)
64 |         return words, logprobs, scores, new_context
65 | 
-------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/train/lr_scheduler.py: --------------------------------------------------------------------------------
1 | import logging
2 | import math
3 | 
4 | import torch
5 | 
6 | from gnmt.seq2seq.utils import gnmt_print
7 | 
8 | 
9 | def perhaps_convert_float(param, total):
10 |     if isinstance(param, float):
11 |         param = int(param * total)
12 |     return param
13 | 
14 | 
15 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler):
16 |     """
17 |     Learning rate scheduler with exponential warmup and step decay.
18 |     """
19 |     def __init__(self, optimizer, iterations, warmup_steps=0,
20 |                  remain_steps=1.0, decay_interval=None, decay_steps=4,
21 |                  decay_factor=0.5, last_epoch=-1):
22 |         """
23 |         Constructor of WarmupMultiStepLR.
24 | 
25 |         Parameters: warmup_steps, remain_steps and decay_interval accept both
26 |         integers and floats as an input. Integer input is interpreted as
27 |         absolute index of iteration, float input is interpreted as a fraction
28 |         of total training iterations (epochs * steps_per_epoch).
29 | 
30 |         If decay_interval is None then the decay will happen at regularly spaced
31 |         intervals ('decay_steps' decays between iteration indices
32 |         'remain_steps' and 'iterations').
33 | 
34 |         :param optimizer: instance of optimizer
35 |         :param iterations: total number of training iterations
36 |         :param warmup_steps: number of warmup iterations
37 |         :param remain_steps: start decay at 'remain_steps' iteration
38 |         :param decay_interval: interval between LR decay steps
39 |         :param decay_steps: max number of decay steps
40 |         :param decay_factor: decay factor
41 |         :param last_epoch: the index of last iteration
42 |         """
43 | 
44 |         # iterations before learning rate reaches base LR
45 |         self.warmup_steps = perhaps_convert_float(warmup_steps, iterations)
46 | 
47 |         # iteration at which decay starts
48 |         self.remain_steps = perhaps_convert_float(remain_steps, iterations)
49 | 
50 |         # number of steps between each decay
51 |         if decay_interval is None:
52 |             # decay at regularly spaced intervals
53 |             decay_iterations = iterations - self.remain_steps
54 |             self.decay_interval = decay_iterations // (decay_steps)
55 |             self.decay_interval = max(self.decay_interval, 1)
56 |         else:
57 |             self.decay_interval = perhaps_convert_float(decay_interval,
58 |                                                         iterations)
59 | 
60 |         # multiplicative decay factor
61 |         self.decay_factor = decay_factor
62 | 
63 |         # max number of decay steps
64 |         self.decay_steps = decay_steps
65 | 
66 |         if self.warmup_steps > self.remain_steps:
67 |             logging.warning(f'warmup_steps should not be larger than '
68 |                             f'remain_steps, setting warmup_steps=remain_steps')
69 |             self.warmup_steps = self.remain_steps
70 | 
71 |         super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch)
72 | 
73 |     def get_lr(self):
74 |         if self.last_epoch <= self.warmup_steps:
75 |             # exponential lr warmup
76 |             if self.warmup_steps != 0:
77 |                 warmup_factor = math.exp(math.log(0.01) / self.warmup_steps)
78 |             else:
79 |                 warmup_factor = 1.0
80 |             inv_decay = warmup_factor ** (self.warmup_steps - self.last_epoch)
81 |             lr = [base_lr * inv_decay for base_lr in self.base_lrs]
82 | 
83 |         elif self.last_epoch >= self.remain_steps:
84 |             # step decay
85 |             decay_iter = self.last_epoch - self.remain_steps
86 |             num_decay_steps = decay_iter // self.decay_interval + 1
87 |             num_decay_steps = min(num_decay_steps, self.decay_steps)
88 |             lr = [
89 |                 base_lr * (self.decay_factor ** num_decay_steps)
90 |                 for base_lr in self.base_lrs
91 |             ]
92 |         else:
93 |             # base lr
94 |             lr = [base_lr for base_lr in self.base_lrs]
95 |         return lr
96 | 
-------------------------------------------------------------------------------- /experiments/gnmt/seq2seq/train/smoothing.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | 
5 | class LabelSmoothing(nn.Module):
6 |     """
7 |     NLL loss with label smoothing.
8 |     """
9 |     def __init__(self, padding_idx, smoothing=0.0):
10 |         """
11 |         Constructor for the LabelSmoothing module.
12 | 
13 |         :param padding_idx: index of the PAD token
14 |         :param smoothing: label smoothing factor
15 |         """
16 |         super(LabelSmoothing, self).__init__()
17 |         self.padding_idx = padding_idx
18 |         self.confidence = 1.0 - smoothing
19 |         self.smoothing = smoothing
20 | 
21 |     def forward(self, x, target):
22 |         logprobs = torch.nn.functional.log_softmax(x, dim=-1,
23 |                                                    dtype=torch.float32)
24 | 
25 |         non_pad_mask = (target != self.padding_idx)
26 |         nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
27 |         nll_loss = nll_loss.squeeze(1)[non_pad_mask]
28 |         smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
29 |         loss = self.confidence * nll_loss + self.smoothing * smooth_loss
30 |         return loss.sum()
31 | 
-------------------------------------------------------------------------------- /experiments/inception/LICENSE: --------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 | 
3 | Copyright (c) Soumith Chintala 2016,
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 
-------------------------------------------------------------------------------- /experiments/inception/README.md: --------------------------------------------------------------------------------
1 | # Inception v3
2 | The code inside this directory is adapted from the Inception v3 code in
3 | `torchvision` and is covered by the BSD 3-Clause License. See the LICENSE file
4 | in this directory for more information.
5 | 
6 | https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py
7 | 
8 | 
-------------------------------------------------------------------------------- /experiments/inception/entry_point.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | from .
import inception 5 | 6 | 7 | def skyline_model_provider(): 8 | return inception.inception_v3(init_weights=False, aux_logits=False).cuda() 9 | 10 | 11 | def skyline_input_provider(batch_size=16): 12 | return ( 13 | torch.randn((batch_size, 3, 299, 299)).cuda(), 14 | torch.randint(low=0, high=1000, size=(batch_size,)).cuda(), 15 | ) 16 | 17 | 18 | def skyline_iteration_provider(model): 19 | optimizer = torch.optim.SGD(model.parameters(), lr=1e-3) 20 | loss_fn = torch.nn.CrossEntropyLoss() 21 | def iteration(*inputs): 22 | data, labels = inputs 23 | optimizer.zero_grad() 24 | out = model(data) 25 | out = loss_fn(out, labels) 26 | out.backward() 27 | optimizer.step() 28 | return iteration 29 | -------------------------------------------------------------------------------- /experiments/process_raw_data.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | # Operate out of the script directory 6 | SCRIPT_PATH=$(cd $(dirname $0) && pwd -P) 7 | cd $SCRIPT_PATH 8 | 9 | RESULTS_DIR="results/results-$(date "+%F_%H_%M")" 10 | 11 | mkdir -p results 12 | mkdir $RESULTS_DIR 13 | mkdir $RESULTS_DIR/raw 14 | mkdir $RESULTS_DIR/ops 15 | mkdir $RESULTS_DIR/e2e 16 | mkdir $RESULTS_DIR/archives 17 | 18 | for archive in $(ls *.tar.gz); do 19 | tar xvzf $archive -C $RESULTS_DIR/raw 20 | done 21 | 22 | python3 process_results.py \ 23 | --in-dir $RESULTS_DIR/raw \ 24 | --out-ops $RESULTS_DIR/ops \ 25 | --out-e2e $RESULTS_DIR/e2e 26 | 27 | mv *.tar.gz $RESULTS_DIR/archives 28 | 29 | -------------------------------------------------------------------------------- /experiments/resnet/LICENSE: -------------------------------------------------------------------------------- 1 | NOTE: This license and disclaimer applies only to the "resnet.py" file in this 2 | directory. 3 | 4 | BSD 3-Clause License 5 | 6 | Copyright (c) Soumith Chintala 2016, 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, this 13 | list of conditions and the following disclaimer. 14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of the copyright holder nor the names of its 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 24 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 26 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 27 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 29 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 31 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | 
-------------------------------------------------------------------------------- /experiments/resnet/README.md: --------------------------------------------------------------------------------
1 | # ResNet
2 | The code inside this directory is adapted from the ResNet code in `torchvision`
3 | and is covered by the BSD 3-Clause License. See the LICENSE file in this
4 | directory for more information.
5 | 
6 | https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py
7 | 
8 | 
-------------------------------------------------------------------------------- /experiments/resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/resnet/__init__.py -------------------------------------------------------------------------------- /experiments/resnet/entry_point.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | 
4 | from . import resnet
5 | 
6 | 
7 | def skyline_model_provider():
8 |     return resnet.resnet50().cuda()
9 | 
10 | 
11 | def skyline_input_provider(batch_size=16):
12 |     return (
13 |         torch.randn((batch_size, 3, 224, 224)).cuda(),
14 |         torch.randint(low=0, high=1000, size=(batch_size,)).cuda(),
15 |     )
16 | 
17 | 
18 | def skyline_iteration_provider(model):
19 |     optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
20 |     loss_fn = torch.nn.CrossEntropyLoss()
21 |     def iteration(*inputs):
22 |         # The input provider returns (data, labels); only the data goes
23 |         # through the model, and the labels feed the loss.
24 |         data, labels = inputs
25 |         optimizer.zero_grad()
26 |         out = model(data)
27 |         out = loss_fn(out, labels)
28 |         out.backward()
29 |         optimizer.step()
30 |     return iteration
31 | 
-------------------------------------------------------------------------------- /experiments/transformer/LICENSE: --------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Victor Huang
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /experiments/transformer/README.md: --------------------------------------------------------------------------------
1 | Transformer Model (Attention is All You Need)
2 | =============================================
3 | The `tfmr` directory contains a PyTorch implementation of the Transformer model
4 | described in the "[Attention is All You
5 | Need](https://arxiv.org/abs/1706.03762)" paper.
This code was adapted from
6 | Yu-Hsiang Huang's implementation found in
7 | [jadore801120/attention-is-all-you-need-pytorch](https://github.com/jadore801120/attention-is-all-you-need-pytorch).
8 | 
9 | License
10 | -------
11 | The code inside this directory is adapted from Yu-Hsiang Huang's implementation
12 | and therefore shares the same license. The unmodified license can be found in
13 | the `LICENSE` file.
14 | 
-------------------------------------------------------------------------------- /experiments/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/transformer/__init__.py -------------------------------------------------------------------------------- /experiments/transformer/tfmr/Beam.py: --------------------------------------------------------------------------------
1 | """ Manage beam search info structure.
2 | 
3 | Heavily borrowed from OpenNMT-py.
4 | For code in OpenNMT-py, please check the following link:
5 | https://github.com/OpenNMT/OpenNMT-py/blob/master/onmt/Beam.py
6 | """
7 | 
8 | import torch
9 | import numpy as np
10 | import transformer.tfmr.Constants as Constants
11 | 
12 | class Beam():
13 |     ''' Beam search '''
14 | 
15 |     def __init__(self, size, device=False):
16 | 
17 |         self.size = size
18 |         self._done = False
19 | 
20 |         # The score for each translation on the beam.
21 |         self.scores = torch.zeros((size,), dtype=torch.float, device=device)
22 |         self.all_scores = []
23 | 
24 |         # The backpointers at each time-step.
25 |         self.prev_ks = []
26 | 
27 |         # The outputs at each time-step.
28 |         self.next_ys = [torch.full((size,), Constants.PAD, dtype=torch.long, device=device)]
29 |         self.next_ys[0][0] = Constants.BOS
30 | 
31 |     def get_current_state(self):
32 |         "Get the outputs for the current timestep."
33 |         return self.get_tentative_hypothesis()
34 | 
35 |     def get_current_origin(self):
36 |         "Get the backpointers for the current timestep."
37 |         return self.prev_ks[-1]
38 | 
39 |     @property
40 |     def done(self):
41 |         return self._done
42 | 
43 |     def advance(self, word_prob):
44 |         "Update beam status and check if finished or not."
45 |         num_words = word_prob.size(1)
46 | 
47 |         # Sum the previous scores.
48 |         if len(self.prev_ks) > 0:
49 |             beam_lk = word_prob + self.scores.unsqueeze(1).expand_as(word_prob)
50 |         else:
51 |             beam_lk = word_prob[0]
52 | 
53 |         flat_beam_lk = beam_lk.view(-1)
54 | 
55 |         best_scores, best_scores_id = flat_beam_lk.topk(
56 |             self.size, 0, True, True)  # scores sorted in descending order
57 | 
58 |         self.all_scores.append(self.scores)
59 |         self.scores = best_scores
60 | 
61 |         # best_scores_id is flattened as a (beam x word) array,
62 |         # so we need to calculate which word and beam each score came from
63 |         prev_k = best_scores_id // num_words
64 |         self.prev_ks.append(prev_k)
65 |         self.next_ys.append(best_scores_id - prev_k * num_words)
66 | 
67 |         # End condition is when top-of-beam is EOS.
68 |         if self.next_ys[-1][0].item() == Constants.EOS:
69 |             self._done = True
70 |             self.all_scores.append(self.scores)
71 | 
72 |         return self._done
73 | 
74 |     def sort_scores(self):
75 |         "Sort the scores."
76 |         return torch.sort(self.scores, 0, True)
77 | 
78 |     def get_the_best_score_and_idx(self):
79 |         "Get the score of the best in the beam."
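        # sort_scores() sorts in descending order, so index 0 is the best hypothesis.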
80 |         scores, ids = self.sort_scores()
81 |         return scores[0], ids[0]
82 | 
83 |     def get_tentative_hypothesis(self):
84 |         "Get the decoded sequence for the current timestep."
85 | 
86 |         if len(self.next_ys) == 1:
87 |             dec_seq = self.next_ys[0].unsqueeze(1)
88 |         else:
89 |             _, keys = self.sort_scores()
90 |             hyps = [self.get_hypothesis(k) for k in keys]
91 |             hyps = [[Constants.BOS] + h for h in hyps]
92 |             dec_seq = torch.LongTensor(hyps)
93 | 
94 |         return dec_seq
95 | 
96 |     def get_hypothesis(self, k):
97 |         """ Walk back to construct the full hypothesis. """
98 |         hyp = []
99 |         for j in range(len(self.prev_ks) - 1, -1, -1):
100 |             hyp.append(self.next_ys[j+1][k])
101 |             k = self.prev_ks[j][k]
102 | 
103 |         return list(map(lambda x: x.item(), hyp[::-1]))
104 | 
-------------------------------------------------------------------------------- /experiments/transformer/tfmr/Constants.py: --------------------------------------------------------------------------------
1 | 
2 | PAD = 0
3 | UNK = 1
4 | BOS = 2
5 | EOS = 3
6 | 
7 | PAD_WORD = '<blank>'
8 | UNK_WORD = '<unk>'
9 | BOS_WORD = '<s>'
10 | EOS_WORD = '</s>'
11 | 
-------------------------------------------------------------------------------- /experiments/transformer/tfmr/Layers.py: --------------------------------------------------------------------------------
1 | ''' Define the Layers '''
2 | import torch.nn as nn
3 | from transformer.tfmr.SubLayers import MultiHeadAttention, PositionwiseFeedForward
4 | 
5 | __author__ = "Yu-Hsiang Huang"
6 | 
7 | 
8 | class EncoderLayer(nn.Module):
9 |     ''' Compose with two layers '''
10 | 
11 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
12 |         super(EncoderLayer, self).__init__()
13 |         self.slf_attn = MultiHeadAttention(
14 |             n_head, d_model, d_k, d_v, dropout=dropout)
15 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
16 | 
17 |     def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
18 |         enc_output, enc_slf_attn = self.slf_attn(
19 |             enc_input, enc_input, enc_input, mask=slf_attn_mask)
20 |         enc_output *= non_pad_mask
21 | 
22 |         enc_output = self.pos_ffn(enc_output)
23 |         enc_output *= non_pad_mask
24 | 
25 |         return enc_output, enc_slf_attn
26 | 
27 | 
28 | class DecoderLayer(nn.Module):
29 |     ''' Compose with three layers '''
30 | 
31 |     def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
32 |         super(DecoderLayer, self).__init__()
33 |         self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
34 |         self.enc_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
35 |         self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
36 | 
37 |     def forward(self, dec_input, enc_output, non_pad_mask=None, slf_attn_mask=None, dec_enc_attn_mask=None):
38 |         dec_output, dec_slf_attn = self.slf_attn(
39 |             dec_input, dec_input, dec_input, mask=slf_attn_mask)
40 |         dec_output *= non_pad_mask
41 | 
42 |         dec_output, dec_enc_attn = self.enc_attn(
43 |             dec_output, enc_output, enc_output, mask=dec_enc_attn_mask)
44 |         dec_output *= non_pad_mask
45 | 
46 |         dec_output = self.pos_ffn(dec_output)
47 |         dec_output *= non_pad_mask
48 | 
49 |         return dec_output, dec_slf_attn, dec_enc_attn
50 | 
-------------------------------------------------------------------------------- /experiments/transformer/tfmr/Modules.py: --------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | 
5 | __author__ = "Yu-Hsiang Huang"
6 | 
7 | class ScaledDotProductAttention(nn.Module):
8 |     '''
Scaled Dot-Product Attention ''' 9 | 10 | def __init__(self, temperature, attn_dropout=0.1): 11 | super().__init__() 12 | self.temperature = temperature 13 | self.dropout = nn.Dropout(attn_dropout) 14 | self.softmax = nn.Softmax(dim=2) 15 | 16 | def forward(self, q, k, v, mask=None): 17 | 18 | attn = torch.bmm(q, k.transpose(1, 2)) 19 | attn = attn / self.temperature 20 | 21 | if mask is not None: 22 | attn = attn.masked_fill(mask, -np.inf) 23 | 24 | attn = self.softmax(attn) 25 | attn = self.dropout(attn) 26 | output = torch.bmm(attn, v) 27 | 28 | return output, attn 29 | -------------------------------------------------------------------------------- /experiments/transformer/tfmr/Optim.py: -------------------------------------------------------------------------------- 1 | '''A wrapper class for optimizer ''' 2 | import numpy as np 3 | 4 | class ScheduledOptim(): 5 | '''A simple wrapper class for learning rate scheduling''' 6 | 7 | def __init__(self, optimizer, d_model, n_warmup_steps): 8 | self._optimizer = optimizer 9 | self.n_warmup_steps = n_warmup_steps 10 | self.n_current_steps = 0 11 | self.init_lr = np.power(d_model, -0.5) 12 | 13 | def step_and_update_lr(self): 14 | "Step with the inner optimizer" 15 | self._update_learning_rate() 16 | self._optimizer.step() 17 | 18 | def zero_grad(self): 19 | "Zero out the gradients by the inner optimizer" 20 | self._optimizer.zero_grad() 21 | 22 | def _get_lr_scale(self): 23 | return np.min([ 24 | np.power(self.n_current_steps, -0.5), 25 | np.power(self.n_warmup_steps, -1.5) * self.n_current_steps]) 26 | 27 | def _update_learning_rate(self): 28 | ''' Learning rate scheduling per step ''' 29 | 30 | self.n_current_steps += 1 31 | lr = self.init_lr * self._get_lr_scale() 32 | 33 | for param_group in self._optimizer.param_groups: 34 | param_group['lr'] = lr 35 | 36 | -------------------------------------------------------------------------------- /experiments/transformer/tfmr/SubLayers.py: -------------------------------------------------------------------------------- 1 | ''' Define the sublayers in encoder/decoder layer ''' 2 | import numpy as np 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from transformer.tfmr.Modules import ScaledDotProductAttention 6 | 7 | __author__ = "Yu-Hsiang Huang" 8 | 9 | class MultiHeadAttention(nn.Module): 10 | ''' Multi-Head Attention module ''' 11 | 12 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 13 | super().__init__() 14 | 15 | self.n_head = n_head 16 | self.d_k = d_k 17 | self.d_v = d_v 18 | 19 | self.w_qs = nn.Linear(d_model, n_head * d_k) 20 | self.w_ks = nn.Linear(d_model, n_head * d_k) 21 | self.w_vs = nn.Linear(d_model, n_head * d_v) 22 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 23 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 24 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 25 | 26 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 27 | self.layer_norm = nn.LayerNorm(d_model) 28 | 29 | self.fc = nn.Linear(n_head * d_v, d_model) 30 | nn.init.xavier_normal_(self.fc.weight) 31 | 32 | self.dropout = nn.Dropout(dropout) 33 | 34 | 35 | def forward(self, q, k, v, mask=None): 36 | 37 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 38 | 39 | sz_b, len_q, _ = q.size() 40 | sz_b, len_k, _ = k.size() 41 | sz_b, len_v, _ = v.size() 42 | 43 | residual = q 44 | 45 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 46 | k = self.w_ks(k).view(sz_b, 
len_k, n_head, d_k) 47 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 48 | 49 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 50 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 51 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 52 | 53 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 54 | output, attn = self.attention(q, k, v, mask=mask) 55 | 56 | output = output.view(n_head, sz_b, len_q, d_v) 57 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 58 | 59 | output = self.dropout(self.fc(output)) 60 | output = self.layer_norm(output + residual) 61 | 62 | return output, attn 63 | 64 | class PositionwiseFeedForward(nn.Module): 65 | ''' A two-feed-forward-layer module ''' 66 | 67 | def __init__(self, d_in, d_hid, dropout=0.1): 68 | super().__init__() 69 | self.w_1 = nn.Conv1d(d_in, d_hid, 1) # position-wise 70 | self.w_2 = nn.Conv1d(d_hid, d_in, 1) # position-wise 71 | self.layer_norm = nn.LayerNorm(d_in) 72 | self.dropout = nn.Dropout(dropout) 73 | 74 | def forward(self, x): 75 | residual = x 76 | output = x.transpose(1, 2) 77 | output = self.w_2(F.relu(self.w_1(output))) 78 | output = output.transpose(1, 2) 79 | output = self.dropout(output) 80 | output = self.layer_norm(output + residual) 81 | return output 82 | -------------------------------------------------------------------------------- /experiments/transformer/tfmr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geoffxy/habitat/5f01e523a1dc30dbfbaaa39cf4880a534c7781a2/experiments/transformer/tfmr/__init__.py -------------------------------------------------------------------------------- /tools/device-metadata/README.md: -------------------------------------------------------------------------------- 1 | ## Peak Performance 2 | 3 | Use `measure_peak_flops.py` to measure the peak performance (GFLOP/s) on a 4 | given GPU. Note that you need to run this script inside the Habitat container 5 | (or otherwise install the habitat-predictor Python package, which is located 6 | inside the analyzer top-level directory). 7 | -------------------------------------------------------------------------------- /tools/device-metadata/measure_peak_flops.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import statistics 3 | 4 | import habitat 5 | import habitat.habitat_cuda as hc 6 | from habitat.analysis.metrics import Metric 7 | from habitat.profiling.kernel import KernelProfiler 8 | 9 | 10 | def measure_peak_flops(profiler): 11 | results = profiler.measure_kernels(hc._diagnostics.run_flop_test) 12 | assert len(results) == 1 13 | kernel = results[0] 14 | gflops_per_second = ( 15 | kernel.get_metric(Metric.SinglePrecisionAddOps) / kernel.run_time_ns 16 | ) 17 | efficiency = kernel.get_metric(Metric.SinglePrecisionFLOPEfficiency) / 100 18 | return gflops_per_second / efficiency 19 | 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser( 23 | description="Measure the peak performance (FLOP/s) of a GPU." 
24 |     )
25 |     parser.add_argument("device", help="The current device (e.g., RTX2070).")
26 |     parser.add_argument("--trials", type=int, default=5)
27 |     args = parser.parse_args()
28 | 
29 |     profiler = KernelProfiler(
30 |         getattr(habitat.Device, args.device),
31 |         metrics=[
32 |             Metric.SinglePrecisionFLOPEfficiency,
33 |             Metric.SinglePrecisionAddOps,
34 |         ],
35 |     )
36 | 
37 |     results = []
38 |     for trial in range(args.trials):
39 |         print("Running trial {}...".format(trial))
40 |         results.append(measure_peak_flops(profiler))
41 | 
42 |     print("Peak Performance on the {}".format(args.device))
43 |     print("===============================")
44 |     print("Median: {} GFLOP/s".format(statistics.median(results)))
45 |     print("Mean: {} GFLOP/s".format(statistics.mean(results)))
46 |     print("Max.: {} GFLOP/s".format(max(results)))
47 |     print("Min.: {} GFLOP/s".format(min(results)))
48 |     print("Trials: {}".format(args.trials))
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 
-------------------------------------------------------------------------------- /tools/kernel-metadata/extract.sh: --------------------------------------------------------------------------------
1 | #! /bin/bash
2 | 
3 | function usage() {
4 |   echo "Usage: $0 database_name path/to/libtorch.so"
5 |   exit 1
6 | }
7 | 
8 | if [ -z "$1" ] || [ -z "$2" ]; then
9 |   usage "$@"
10 | fi
11 | 
12 | DATABASE_NAME=$1
13 | LIBTORCH_PATH=$2
14 | 
15 | declare -a SHARED_LIBS=(
16 |   $(ldd $LIBTORCH_PATH | grep -E -o "/\S+")
17 |   "$LIBTORCH_PATH"
18 | )
19 | 
20 | for shared_lib in ${SHARED_LIBS[@]}; do
21 |   echo "Processing $shared_lib"
22 |   cuobjdump -res-usage $shared_lib 2> /dev/null | \
23 |     python3 process-cuobjdump-output.py --database $DATABASE_NAME
24 | done
25 | 
26 | echo "Done!"
27 | 
-------------------------------------------------------------------------------- /tools/kernel-metadata/process-cuobjdump-output.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import sqlite3
3 | import sys
4 | import re
5 | 
6 | ARCH_LINE_REGEX = re.compile(r'^arch = sm_(?P<arch>[0-9]+)$')
7 | FUNC_LINE_REGEX = re.compile(r'^\sFunction\s(?P<name>.+):$')
8 | RES_LINE_REGEX = re.compile(r'^\s\sREG:(?P<registers>[0-9]+)\s.*$')
9 | 
10 | 
11 | class Parser:
12 |     """
13 |     Parses cuobjdump output that uses the -res-usage flag.
14 | 
15 |     This parser is implemented using a coroutine. Use the consume() method to
16 |     send input lines to the parser. The consume() method returns a parsed
17 |     kernel or None (when more input is required).
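
    A sketch of the intended driver loop (`lines` here is any iterable of
    raw cuobjdump output lines, trailing newlines included):

        parser = Parser()
        for line in lines:
            kernel = parser.consume(line)
            if kernel is not None:
                name, arch, registers_per_thread = kernel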
18 | """ 19 | def __init__(self): 20 | self._impl = self._parser_coroutine() 21 | next(self._impl) 22 | 23 | def consume(self, line): 24 | result = self._impl.send(line) 25 | if result is not None: 26 | next(self._impl) 27 | return result 28 | 29 | def _parser_coroutine(self): 30 | arch = None 31 | 32 | while True: 33 | line = (yield)[:-1] 34 | 35 | arch_match = ARCH_LINE_REGEX.match(line) 36 | if arch_match is not None: 37 | arch = int(arch_match.group('arch')) 38 | continue 39 | 40 | func_line_match = FUNC_LINE_REGEX.match(line) 41 | if func_line_match is None: 42 | continue 43 | 44 | # When we find a function, we expect the next line to be its 45 | # corresponding resource string 46 | func_name = func_line_match.group('name') 47 | 48 | res_line = (yield)[:-1] 49 | 50 | resource_match = RES_LINE_REGEX.match(res_line) 51 | if resource_match is None: 52 | raise AssertionError( 53 | 'Missing resource information for function: ' + func_name) 54 | 55 | registers_per_thread = int(resource_match.group('registers')) 56 | yield (func_name, arch, registers_per_thread) 57 | 58 | 59 | def ensure_tables_exist(connection): 60 | create_table = """ 61 | CREATE TABLE IF NOT EXISTS kernels ( 62 | name TEXT NOT NULL, 63 | arch INT NOT NULL, 64 | registers_per_thread INT NOT NULL, 65 | PRIMARY KEY (name, arch) 66 | ) 67 | """ 68 | cursor = connection.cursor() 69 | cursor.execute(create_table) 70 | connection.commit() 71 | 72 | 73 | def insert_kernel(connection, name, arch, registers_per_thread): 74 | query = """ 75 | INSERT INTO kernels (name, arch, registers_per_thread) VALUES (?, ?, ?) 76 | """ 77 | cursor = connection.cursor() 78 | cursor.execute(query, (name, arch, registers_per_thread)) 79 | 80 | 81 | def process_cuobjdump_output(connection): 82 | parser = Parser() 83 | 84 | for line in iter(sys.stdin.readline, ''): 85 | kernel_info = parser.consume(line) 86 | if kernel_info is None: 87 | continue 88 | 89 | try: 90 | insert_kernel(connection, *kernel_info) 91 | except sqlite3.IntegrityError: 92 | # cuobjdump duplicates kernel entries - skip them for now 93 | pass 94 | 95 | 96 | def main(): 97 | parser = argparse.ArgumentParser() 98 | parser.add_argument('--database', type=str, required=True) 99 | args = parser.parse_args() 100 | 101 | connection = sqlite3.connect(args.database) 102 | ensure_tables_exist(connection) 103 | process_cuobjdump_output(connection) 104 | connection.commit() 105 | connection.close() 106 | 107 | 108 | if __name__ == '__main__': 109 | main() 110 | -------------------------------------------------------------------------------- /tools/recording/database.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | FEATURES_TEMPLATE = '{feature} INTEGER NOT NULL,' 7 | 8 | 9 | class Recorder: 10 | def __init__(self, file_name, features): 11 | self._file_name = file_name 12 | self._features = features 13 | self._strings = {} 14 | self._connection, self._cursor = self._initialize() 15 | 16 | def _generate_queries(self): 17 | features_sql = ''.join(map( 18 | lambda f: FEATURES_TEMPLATE.format(feature=f), 19 | self._features, 20 | )) 21 | self._create_recordings = """ 22 | CREATE TABLE IF NOT EXISTS recordings ( 23 | id INTEGER PRIMARY KEY, 24 | {features} 25 | is_forward INTEGER NOT NULL, 26 | run_time_ms REAL NOT NULL 27 | ) 28 | """.format(features=features_sql) 29 | self._insert_recording = """ 30 | INSERT INTO recordings ( 31 | {features}, 32 | is_forward, 33 | 
run_time_ms
34 |         )
35 |         VALUES ({values} ?, ?)
36 |         """.format(
37 |             features=','.join(self._features),
38 |             values='?,' * len(self._features),
39 |         )
40 | 
41 |     def _initialize(self):
42 |         self._generate_queries()
43 |         connection = sqlite3.connect(self._file_name)
44 |         cursor = connection.cursor()
45 |         cursor.execute(self._create_recordings)
46 |         cursor.execute("""
47 |           CREATE TABLE IF NOT EXISTS kernels (
48 |             id INTEGER PRIMARY KEY,
49 |             recording_id INTEGER NOT NULL,
50 |             kernel_name INTEGER NOT NULL,
51 |             run_time_ns INTEGER NOT NULL
52 |           )
53 |         """)
54 |         cursor.execute("""
55 |           CREATE TABLE IF NOT EXISTS strings (
56 |             id INTEGER PRIMARY KEY,
57 |             value TEXT NOT NULL
58 |           )
59 |         """)
60 |         connection.commit()
61 |         return connection, cursor
62 | 
63 |     def get_num_recordings(self):
64 |         self._cursor.execute("SELECT COUNT(*) FROM recordings")
65 |         return self._cursor.fetchone()[0]
66 | 
67 |     def record(self, config, is_forward, run_time_ms, recorded_kernels):
68 |         try:
69 |             self._cursor.execute(
70 |                 self._insert_recording,
71 |                 (*tuple(map(int, config)), int(is_forward), run_time_ms),
72 |             )
73 |             recording_id = self._cursor.lastrowid
74 |             for kernel in recorded_kernels:
75 |                 if kernel.name in self._strings:
76 |                     kernel_name = self._strings[kernel.name]
77 |                 else:
78 |                     self._cursor.execute(Recorder.insert_string, (kernel.name,))
79 |                     kernel_name = self._cursor.lastrowid
80 |                     self._strings[kernel.name] = kernel_name
81 | 
82 |                 self._cursor.execute(
83 |                     Recorder.insert_kernel,
84 |                     (recording_id, kernel_name, kernel.run_time_ns)
85 |                 )
86 |         except OverflowError:
87 |             logger.warning(
88 |                 'Could not record a kernel because its run time overflowed '
89 |                 'the SQLite integer datatype.'
90 |             )
91 | 
92 |     def commit(self):
93 |         self._connection.commit()
94 | 
95 |     def __del__(self):
96 |         self._connection.commit()
97 |         self._connection.close()
98 | 
99 | 
100 | Recorder.insert_kernel = """
101 | INSERT INTO kernels (recording_id, kernel_name, run_time_ns) VALUES (?, ?, ?)
102 | """
103 | 
104 | Recorder.insert_string = """
105 | INSERT INTO strings (value) VALUES (?)
106 | """ 107 | -------------------------------------------------------------------------------- /tools/recording/features.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | conv2d = [ 4 | 'bias', 5 | 'batch', 6 | 'image_size', 7 | 'in_channels', 8 | 'out_channels', 9 | 'kernel_size', 10 | 'stride', 11 | 'padding', 12 | ] 13 | 14 | bmm = [ 15 | 'batch', 16 | # (batch, left, middle) x (batch, middle, right) 17 | 'left', 18 | 'middle', 19 | 'right', 20 | ] 21 | 22 | lstm = [ 23 | 'bias', # 0 or 1, represents the bias flag 24 | 'bidirectional', # 0 or 1, represents the bidirectional flag 25 | 'batch', 26 | 'seq_len', 27 | 'input_size', 28 | 'hidden_size', 29 | 'num_layers', 30 | ] 31 | 32 | linear = [ 33 | 'bias', 34 | 'batch', 35 | 'in_features', 36 | 'out_features', 37 | ] 38 | 39 | FEATURES = { 40 | 'bmm': bmm, 41 | 'conv2d': conv2d, 42 | 'linear': linear, 43 | 'lstm': lstm, 44 | } 45 | -------------------------------------------------------------------------------- /tools/recording/record_bmm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import torch 5 | from record_common import Measurer 6 | import features as f 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def index_to_config(args, index): 12 | batch = (index % args.batches) + 1 13 | index //= args.batches 14 | 15 | left = (index % args.left) + 1 16 | index //= args.left 17 | 18 | middle = (index % args.middle) + 1 19 | index //= args.middle 20 | 21 | right = index + 1 22 | 23 | return ( 24 | batch, 25 | left, 26 | middle, 27 | right, 28 | ) 29 | 30 | 31 | def config_to_profiler_args(config): 32 | (batch, left, middle, right) = config 33 | o1 = torch.randn((batch, left, middle)).cuda() 34 | o2 = torch.randn((batch, middle, right)).cuda() 35 | o1.requires_grad_() 36 | o2.requires_grad_() 37 | return { 38 | 'func': torch.bmm, 39 | 'args': (o1, o2), 40 | 'kwargs': {}, 41 | } 42 | 43 | 44 | def main(): 45 | measurer = Measurer( 46 | op_name='bmm', 47 | recorder_config=f.bmm, 48 | index_to_config=index_to_config, 49 | config_to_profiler_args=config_to_profiler_args, 50 | ) 51 | parser = argparse.ArgumentParser() 52 | measurer.add_args(parser) 53 | parser.add_argument('--batches', type=int, default=128) 54 | parser.add_argument('--left', type=int, default=1024) 55 | parser.add_argument('--middle', type=int, default=1024) 56 | parser.add_argument('--right', type=int, default=1024) 57 | args = parser.parse_args() 58 | 59 | num_configs = ( 60 | args.batches * 61 | args.left * 62 | args.middle * 63 | args.right 64 | ) 65 | measurer.measure_configurations(args, num_configs) 66 | 67 | 68 | if __name__ == '__main__': 69 | kwargs = { 70 | "format": "%(asctime)s %(levelname)-8s %(message)s", 71 | "datefmt": "%Y-%m-%d %H:%M", 72 | "level": logging.INFO, 73 | } 74 | logging.basicConfig(**kwargs) 75 | main() 76 | -------------------------------------------------------------------------------- /tools/recording/record_conv2d.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import math 4 | 5 | import torch 6 | from record_common import Measurer 7 | import features as f 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | MIN_IN_CHANNELS = 3 12 | MIN_OUT_CHANNELS = 16 13 | 14 | torch.backends.cudnn.benchmark = True 15 | 16 | 17 | def index_to_config(args, index): 18 | bias = False if index % 2 == 0 else True 19 | index //= 2 20 | 21 | 
batch = (index % args.batches) + 1 22 | index //= args.batches 23 | 24 | image_size = (index % args.image_size) + 1 25 | index //= args.image_size 26 | 27 | in_channels = (index % args.in_channels) + 1 28 | index //= args.in_channels 29 | 30 | out_channels = (index % args.out_channels) + 1 31 | index //= args.out_channels 32 | 33 | kernel_size = (index % args.kernel_size) + 1 34 | index //= args.kernel_size 35 | 36 | stride = (index % args.stride) + 1 37 | index //= args.stride 38 | 39 | # Padding is 0-based 40 | padding = index 41 | 42 | return ( 43 | bias, 44 | batch, 45 | image_size, 46 | in_channels, 47 | out_channels, 48 | kernel_size, 49 | stride, 50 | padding, 51 | ) 52 | 53 | 54 | def index_filter(args, index): 55 | config = index_to_config(args, index) 56 | # NOTE: We multiply because the dimensions have different ranges; we want 57 | # them to each "contribute equally". We weigh the image size more to 58 | # select smaller image sizes. 59 | # image_size (1-dim) * in_channels * out_channels * kernel_size 60 | conv_size = math.pow(config[2], 1.15) * config[3] * config[4] * config[5] 61 | 62 | # NOTE: This value was chosen arbitrarily: we don't want the in/out 63 | # channels and image size to all be too large. This way, large values 64 | # for the in/out channels would lead to a smaller image size (and 65 | # vice versa). 66 | return conv_size <= 35000000 67 | 68 | 69 | def config_to_profiler_args(config): 70 | (bias, 71 | batch, 72 | image_size, 73 | in_channels, 74 | out_channels, 75 | kernel_size, 76 | stride, 77 | padding) = config 78 | 79 | # Easiest way to exclude certain sample configurations 80 | if in_channels < MIN_IN_CHANNELS or out_channels < MIN_OUT_CHANNELS: 81 | return None 82 | 83 | device = torch.device('cuda') 84 | conv2d = torch.nn.Conv2d( 85 | in_channels=in_channels, 86 | out_channels=out_channels, 87 | kernel_size=kernel_size, 88 | stride=stride, 89 | padding=padding, 90 | bias=bias, 91 | ).to(device) 92 | inp = torch.randn(( 93 | batch, 94 | in_channels, 95 | image_size, 96 | image_size, 97 | ), device=device) 98 | # NOTE: This is important: for most convolutions, we will also need the 99 | # gradient with respect to the input to be able to backpropagate to 100 | # earlier operations in the network. 
101 |     inp = inp.requires_grad_()
102 | 
103 |     return {
104 |         'func': conv2d,
105 |         'args': (inp,),
106 |         'kwargs': {},
107 |     }
108 | 
109 | 
110 | def main():
111 |     measurer = Measurer(
112 |         op_name='conv2d',
113 |         recorder_config=f.conv2d,
114 |         index_to_config=index_to_config,
115 |         index_filter=index_filter,
116 |         config_to_profiler_args=config_to_profiler_args,
117 |     )
118 | 
119 |     parser = argparse.ArgumentParser()
120 |     measurer.add_args(parser)
121 |     parser.add_argument('--batches', type=int, default=64)
122 |     parser.add_argument('--image-size', type=int, default=256)
123 |     parser.add_argument('--in-channels', type=int, default=2048)
124 |     parser.add_argument('--out-channels', type=int, default=2048)
125 |     parser.add_argument('--kernel-size', type=int, default=11)
126 |     parser.add_argument('--stride', type=int, default=4)
127 |     # Padding is 0-based, so this means we consider 0 to 3 inclusive
128 |     parser.add_argument('--padding', type=int, default=4)
129 |     args = parser.parse_args()
130 | 
131 |     num_configs = (
132 |         2 *  # Whether or not there is a bias
133 |         args.batches *
134 |         args.image_size *
135 |         args.in_channels *
136 |         args.out_channels *
137 |         args.kernel_size *
138 |         args.stride *
139 |         args.padding
140 |     )
141 | 
142 |     # Conv2d has filtering, so we won't have exactly 200000 points (the
143 |     # default). So here we increase the number of starting points.
144 |     if args.num_points == 200000:
145 |         args.num_points *= 6
146 | 
147 |     measurer.measure_configurations(args, num_configs)
148 | 
149 | 
150 | if __name__ == '__main__':
151 |     kwargs = {
152 |         "format": "%(asctime)s %(levelname)-8s %(message)s",
153 |         "datefmt": "%Y-%m-%d %H:%M",
154 |         "level": logging.INFO,
155 |     }
156 |     logging.basicConfig(**kwargs)
157 |     main()
158 | 
-------------------------------------------------------------------------------- /tools/recording/record_linear.py: --------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | 
4 | import torch
5 | from record_common import Measurer
6 | import features as f
7 | 
8 | logger = logging.getLogger(__name__)
9 | 
10 | 
11 | def index_to_config(args, index):
12 |     bias = False if index % 2 == 0 else True
13 |     index //= 2
14 | 
15 |     batch = (index % args.batches) + 1
16 |     index //= args.batches
17 | 
18 |     in_features = (index % args.in_features) + 1
19 |     index //= args.in_features
20 | 
21 |     out_features = index + 1
22 | 
23 |     return (
24 |         bias,
25 |         batch,
26 |         in_features,
27 |         out_features,
28 |     )
29 | 
30 | 
31 | def config_to_profiler_args(config):
32 |     (bias, batch, in_features, out_features) = config
33 |     linear = torch.nn.Linear(
34 |         in_features=in_features, out_features=out_features, bias=bias).cuda()
35 |     inp = torch.randn((batch, in_features)).cuda()
36 |     # NOTE: This is important: for most linear layers, we will also need the
37 |     #       gradient with respect to the input to be able to backpropagate to
38 |     #       earlier operations in the network.
39 |     inp = inp.requires_grad_()
40 |     return {
41 |         'func': linear,
42 |         'args': (inp,),
43 |         'kwargs': {},
44 |     }
45 | 
46 | 
47 | def index_filter(args, index):
48 |     config = index_to_config(args, index)
49 |     # NOTE: We multiply because the dimensions have different ranges; we
50 |     #       want them to each "contribute equally" when bounding the size
51 |     #       of the sampled configurations:
52 | # batch * in_features * out_features 53 | linear_size = config[1] * config[2] * config[3] 54 | 55 | # NOTE: This value was chosen arbitrarily: we don't want the in/out 56 | # features and batch to all be too large. This way, large values 57 | # for the in/out features would lead to a smaller batch size (and 58 | # vice versa). 59 | return linear_size <= 840000000 60 | 61 | 62 | def main(): 63 | measurer = Measurer( 64 | op_name='linear', 65 | recorder_config=f.linear, 66 | index_to_config=index_to_config, 67 | index_filter=index_filter, 68 | config_to_profiler_args=config_to_profiler_args, 69 | ) 70 | parser = argparse.ArgumentParser() 71 | measurer.add_args(parser) 72 | parser.add_argument('--batches', type=int, default=3500) 73 | parser.add_argument('--in-features', type=int, default=32768) 74 | parser.add_argument('--out-features', type=int, default=32768) 75 | args = parser.parse_args() 76 | 77 | # Linear has filtering, so we won't have exactly 200000 points (the 78 | # default). So here we increase the number of starting points. 79 | if args.num_points == 200000: 80 | args.num_points *= 80 81 | 82 | num_configs = ( 83 | 2 * # Whether or not there is a bias 84 | args.batches * 85 | args.in_features * 86 | args.out_features 87 | ) 88 | measurer.measure_configurations(args, num_configs) 89 | 90 | 91 | if __name__ == '__main__': 92 | kwargs = { 93 | "format": "%(asctime)s %(levelname)-8s %(message)s", 94 | "datefmt": "%Y-%m-%d %H:%M", 95 | "level": logging.INFO, 96 | } 97 | logging.basicConfig(**kwargs) 98 | main() 99 | -------------------------------------------------------------------------------- /tools/recording/record_lstm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | import torch 5 | from record_common import Measurer 6 | import features as f 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | def index_to_config(args, index): 12 | bias = index % 2 13 | index //= 2 14 | 15 | bidirectional = index % 2 16 | index //= 2 17 | 18 | batch = (index % args.batches) + 1 19 | index //= args.batches 20 | 21 | seq_len = (index % args.seq_len) + 1 22 | index //= args.seq_len 23 | 24 | input_size = (index % args.input_size) + 1 25 | index //= args.input_size 26 | 27 | hidden_size = (index % args.hidden_size) + 1 28 | index //= args.hidden_size 29 | 30 | num_layers = index + 1 31 | 32 | return ( 33 | bias, 34 | bidirectional, 35 | batch, 36 | seq_len, 37 | input_size, 38 | hidden_size, 39 | num_layers, 40 | ) 41 | 42 | 43 | def config_to_profiler_args(config): 44 | (bias, 45 | bidirectional, 46 | batch, 47 | seq_len, 48 | input_size, 49 | hidden_size, 50 | num_layers) = config 51 | inputs = torch.randn((seq_len, batch, input_size)).cuda() 52 | lstm = torch.nn.LSTM( 53 | input_size=input_size, 54 | hidden_size=hidden_size, 55 | num_layers=num_layers, 56 | bias=bool(bias), 57 | bidirectional=bool(bidirectional), 58 | ).cuda() 59 | # NOTE: This is important: for most LSTMs, we will also need the gradient 60 | # with respect to the input to be able to backpropagate to earlier 61 | # operations in the network. 
62 | inputs = inputs.requires_grad_() 63 | return { 64 | 'func': lstm, 65 | 'args': (inputs,), 66 | 'kwargs': {}, 67 | } 68 | 69 | 70 | def main(): 71 | measurer = Measurer( 72 | op_name='lstm', 73 | recorder_config=f.lstm, 74 | index_to_config=index_to_config, 75 | config_to_profiler_args=config_to_profiler_args, 76 | ) 77 | parser = argparse.ArgumentParser() 78 | measurer.add_args(parser) 79 | parser.add_argument('--batches', type=int, default=128) 80 | parser.add_argument('--seq-len', type=int, default=64) 81 | parser.add_argument('--input-size', type=int, default=1280) 82 | parser.add_argument('--hidden-size', type=int, default=1280) 83 | parser.add_argument('--num-layers', type=int, default=6) 84 | args = parser.parse_args() 85 | 86 | num_configs = ( 87 | 2 * # bias 88 | 2 * # bidirectional 89 | args.batches * 90 | args.seq_len * 91 | args.input_size * 92 | args.hidden_size * 93 | args.num_layers 94 | ) 95 | measurer.measure_configurations(args, num_configs) 96 | 97 | 98 | if __name__ == '__main__': 99 | kwargs = { 100 | "format": "%(asctime)s %(levelname)-8s %(message)s", 101 | "datefmt": "%Y-%m-%d %H:%M", 102 | "level": logging.INFO, 103 | } 104 | logging.basicConfig(**kwargs) 105 | main() 106 | --------------------------------------------------------------------------------
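A note on the configuration enumeration shared by the record_* scripts above:
record_bmm.py, record_conv2d.py, record_linear.py, and record_lstm.py all
decode a flat integer index into one value per feature by treating the index
as a mixed-radix number (remainder selects the current feature, floor
division moves to the next). A minimal standalone sketch of that decoding
(the names `dims` and `decode_index` are illustrative, not part of the
tools):

    def decode_index(dims, index):
        """Decode a flat index into a 1-based value per dimension."""
        config = []
        for radix in dims[:-1]:
            config.append(index % radix + 1)  # remainder picks this value
            index //= radix                   # move on to the next "digit"
        config.append(index + 1)              # last dimension takes the rest
        return tuple(config)

    # With dims = (2, 3, 4) there are 24 configurations in total; index 0
    # decodes to (1, 1, 1) and index 23 decodes to (2, 3, 4), mirroring how
    # record_bmm.py walks batch x left x middle x right.
    assert decode_index((2, 3, 4), 0) == (1, 1, 1)
    assert decode_index((2, 3, 4), 23) == (2, 3, 4)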