├── .github ├── CODEOWNERS └── workflows │ └── cla.yml ├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── LICENSE.md ├── README.md ├── benchmark_readme_template.md ├── graph_neural_network ├── Dockerfile ├── Dockerfile.h100 ├── README.md ├── build_partition_feature.py ├── compress_graph.py ├── dataset.py ├── dist_train_rgnn.py ├── download.py ├── download_igbh_full.sh ├── mlperf_logging_utils.py ├── partition.py ├── rgnn.py ├── split_seeds.py ├── train_rgnn_multi_gpu.py └── utilities.py ├── install_cuda_docker.sh ├── language_model └── tensorflow │ └── bert │ ├── README.md │ ├── __init__.py │ ├── checkpoint_add_gradacc.py │ ├── cleanup_scripts │ ├── clean.sh │ ├── cleanup_file.py │ ├── create_pretraining_data.py │ ├── do_gather.py │ ├── do_sentence_segmentation.py │ ├── download_and_uncompress.sh │ ├── eval.md5 │ ├── pick_eval_samples.py │ ├── process_wiki.sh │ ├── sample_data │ │ ├── wiki_00 │ │ ├── wiki_01 │ │ ├── wiki_02 │ │ └── wiki_03 │ ├── seperate_test_set.py │ └── tokenization.py │ ├── dataset.md │ ├── deferred_grad_optimizer.py │ ├── distribution_utils.py │ ├── extract_features.py │ ├── lamb_optimizer_v1.py │ ├── mlp_logging.py │ ├── modeling.py │ ├── optimization.py │ ├── run_pretraining.py │ └── tpu_lib.py ├── large_language_model_pretraining └── nemo │ ├── Dockerfile │ ├── README.md │ ├── callbacks.py │ ├── config.sh │ ├── mcore.patch │ ├── pretrain_llama31.py │ ├── run_llama31.sh │ └── utils │ ├── consolidate_data.sh │ ├── launch_nemo_convert.sh │ ├── nemo_convert.py │ └── preprocess.sh ├── llama2_70b_lora ├── Dockerfile ├── README.md ├── configs │ └── default_config.yaml ├── convergence_example.txt ├── requirements.txt ├── run_docker.sh ├── run_llama_70B_scrolls_r16.sh └── scripts │ ├── mlperf_logging_utils.py │ ├── train.py │ └── utils.py ├── recommendation_v2 └── torchrec_dlrm │ ├── Dockerfile │ ├── README.MD │ ├── __init__.py │ ├── aws_component.py │ ├── data │ ├── __init__.py │ ├── dlrm_dataloader.py │ └── multi_hot_criteo.py │ ├── 
dlrm_main.py │ ├── lr_scheduler.py │ ├── md5sums_MLPerf_v2_synthetic_multi_hot_sparse_dataset.txt │ ├── md5sums_preprocessed_criteo_click_logs_dataset.txt │ ├── mlperf_logging_utils.py │ ├── multi_hot.py │ ├── requirements.txt │ ├── scripts │ ├── materialize_synthetic_multihot_dataset.py │ └── process_Criteo_1TB_Click_Logs_dataset.sh │ └── tests │ └── test_dlrm_main.py ├── reference_results.md ├── retired_benchmarks ├── dlrm │ ├── download_dataset.sh │ └── verify_dataset.sh ├── gnmt │ ├── .dockerignore │ ├── .gitignore │ ├── README.md │ ├── download_dataset.sh │ ├── pytorch │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ ├── requirements.txt │ │ ├── run.sh │ │ ├── run_and_time.sh │ │ ├── scripts │ │ │ ├── docker │ │ │ │ ├── build.sh │ │ │ │ └── interactive.sh │ │ │ └── filter_dataset.py │ │ ├── seq2seq │ │ │ ├── data │ │ │ │ ├── config.py │ │ │ │ ├── dataset.py │ │ │ │ ├── sampler.py │ │ │ │ └── tokenizer.py │ │ │ ├── inference │ │ │ │ ├── beam_search.py │ │ │ │ └── inference.py │ │ │ ├── models │ │ │ │ ├── attention.py │ │ │ │ ├── decoder.py │ │ │ │ ├── encoder.py │ │ │ │ ├── gnmt.py │ │ │ │ └── seq2seq_base.py │ │ │ ├── train │ │ │ │ ├── fp_optimizers.py │ │ │ │ ├── lr_scheduler.py │ │ │ │ ├── smoothing.py │ │ │ │ └── trainer.py │ │ │ └── utils.py │ │ ├── train.py │ │ └── translate.py │ └── verify_dataset.sh ├── gpt3 │ ├── megatron-lm │ │ ├── .gitignore │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── README.md │ │ ├── checksums │ │ │ ├── additional_checkpoint_files │ │ │ │ ├── common.pt │ │ │ │ └── metadata.json │ │ │ ├── dataset_checksum.log │ │ │ └── fp32_checkpoint_checksum.log │ │ ├── megatron │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── checkpointing.py │ │ │ ├── core │ │ │ │ └── dist_checkpointing │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── core.py │ │ │ │ │ ├── dict_utils.py │ │ │ │ │ ├── mapping.py │ │ │ │ │ ├── optimizer.py │ │ │ │ │ ├── serialization.py │ │ │ │ │ ├── strategies │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ └── 
zarr.py │ │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── common.py │ │ │ │ │ ├── test_correctness.py │ │ │ │ │ └── test_load_check.py │ │ │ │ │ └── utils.py │ │ │ ├── data │ │ │ │ ├── Makefile │ │ │ │ ├── __init__.py │ │ │ │ ├── autoaugment.py │ │ │ │ ├── biencoder_dataset_utils.py │ │ │ │ ├── blendable_dataset.py │ │ │ │ ├── data_samplers.py │ │ │ │ ├── dataset_utils.py │ │ │ │ ├── gpt_dataset.py │ │ │ │ ├── helpers.cpp │ │ │ │ ├── ict_dataset.py │ │ │ │ ├── indexed_dataset.py │ │ │ │ ├── orqa_wiki_dataset.py │ │ │ │ ├── realm_dataset_utils.py │ │ │ │ ├── realm_index.py │ │ │ │ └── test │ │ │ │ │ ├── test_indexed_dataset.py │ │ │ │ │ └── test_preprocess_data.sh │ │ │ ├── dist_signal_handler.py │ │ │ ├── fp16_deprecated │ │ │ │ └── loss_scaler.py │ │ │ ├── fused_kernels │ │ │ │ ├── __init__.py │ │ │ │ ├── compat.h │ │ │ │ ├── fused_weight_gradient_dense.cpp │ │ │ │ ├── fused_weight_gradient_dense.cu │ │ │ │ ├── layer_norm_cuda.cpp │ │ │ │ ├── layer_norm_cuda_kernel.cu │ │ │ │ ├── scaled_masked_softmax.cpp │ │ │ │ ├── scaled_masked_softmax.h │ │ │ │ ├── scaled_masked_softmax_cuda.cu │ │ │ │ ├── scaled_softmax.cpp │ │ │ │ ├── scaled_softmax_cuda.cu │ │ │ │ ├── scaled_upper_triang_masked_softmax.cpp │ │ │ │ ├── scaled_upper_triang_masked_softmax.h │ │ │ │ ├── scaled_upper_triang_masked_softmax_cuda.cu │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_fused_kernels.py │ │ │ │ └── type_shim.h │ │ │ ├── global_vars.py │ │ │ ├── indexer.py │ │ │ ├── initialize.py │ │ │ ├── learning_rates.py │ │ │ ├── memory.py │ │ │ ├── microbatches.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── biencoder_model.py │ │ │ │ ├── classification.py │ │ │ │ ├── distributed.py │ │ │ │ ├── enums.py │ │ │ │ ├── fused_bias_gelu.py │ │ │ │ ├── fused_layer_norm.py │ │ │ │ ├── fused_softmax.py │ │ │ │ ├── gpt_model.py │ │ │ │ ├── language_model.py │ │ │ │ ├── module.py │ │ │ │ ├── multiple_choice.py │ │ │ │ ├── realm_model.py │ │ │ │ ├── transformer.py │ │ │ │ └── 
utils.py │ │ │ ├── mpu │ │ │ │ ├── __init__.py │ │ │ │ ├── cross_entropy.py │ │ │ │ ├── data.py │ │ │ │ ├── initialize.py │ │ │ │ ├── layers.py │ │ │ │ ├── mappings.py │ │ │ │ ├── random.py │ │ │ │ ├── tests │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── commons.py │ │ │ │ │ ├── test_cross_entropy.py │ │ │ │ │ ├── test_data.py │ │ │ │ │ ├── test_initialize.py │ │ │ │ │ ├── test_layers.py │ │ │ │ │ └── test_random.py │ │ │ │ └── utils.py │ │ │ ├── optimizer │ │ │ │ ├── __init__.py │ │ │ │ ├── clip_grads.py │ │ │ │ ├── distrib_optimizer.py │ │ │ │ ├── grad_scaler.py │ │ │ │ └── optimizer.py │ │ │ ├── optimizer_param_scheduler.py │ │ │ ├── p2p_communication.py │ │ │ ├── schedules.py │ │ │ ├── static │ │ │ │ └── index.html │ │ │ ├── text_generation │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── beam_utils.py │ │ │ │ ├── communication.py │ │ │ │ ├── forward_step.py │ │ │ │ ├── generation.py │ │ │ │ ├── sampling.py │ │ │ │ └── tokenization.py │ │ │ ├── text_generation_server.py │ │ │ ├── timers.py │ │ │ ├── tokenizer │ │ │ │ ├── __init__.py │ │ │ │ ├── bert_tokenization.py │ │ │ │ ├── gpt2_tokenization.py │ │ │ │ └── tokenizer.py │ │ │ ├── training.py │ │ │ └── utils.py │ │ ├── pretrain_gpt.py │ │ ├── requirements.txt │ │ ├── run_gpt3.sh │ │ ├── scripts │ │ │ ├── common_bf16.json │ │ │ ├── common_fp32.json │ │ │ ├── convert_paxml_to_megatron_distributed.py │ │ │ ├── json_to_torch.py │ │ │ ├── load_checkpoint.md │ │ │ ├── load_checkpoint.py │ │ │ ├── preprocess.sh │ │ │ └── preprocess_val.sh │ │ ├── tasks │ │ │ ├── data_utils.py │ │ │ ├── ensemble_classifier.py │ │ │ ├── eval_utils.py │ │ │ ├── finetune_utils.py │ │ │ ├── glue │ │ │ │ ├── data.py │ │ │ │ ├── finetune.py │ │ │ │ ├── mnli.py │ │ │ │ └── qqp.py │ │ │ ├── main.py │ │ │ ├── orqa │ │ │ │ ├── README.md │ │ │ │ ├── evaluate_orqa.py │ │ │ │ ├── evaluate_utils.py │ │ │ │ ├── supervised │ │ │ │ │ ├── data.py │ │ │ │ │ ├── eval_utils.py │ │ │ │ │ └── finetune.py │ │ │ │ └── unsupervised │ │ │ │ │ ├── nq.py │ │ │ │ │ 
├── qa_utils.py │ │ │ │ │ └── tokenizers.py │ │ │ ├── race │ │ │ │ ├── data.py │ │ │ │ └── finetune.py │ │ │ ├── vision │ │ │ │ ├── classification.py │ │ │ │ ├── eval_utils.py │ │ │ │ ├── finetune_utils.py │ │ │ │ └── main.py │ │ │ └── zeroshot_gpt │ │ │ │ ├── datasets.py │ │ │ │ ├── detokenizer.py │ │ │ │ └── evaluate.py │ │ └── tools │ │ │ └── preprocess_data.py │ └── paxml │ │ ├── README.md │ │ ├── c4.py │ │ ├── c4_mllog.py │ │ ├── lm_cloud.py │ │ ├── model_params.py │ │ └── utils │ │ ├── generate_spm.md │ │ ├── load_ts_ckpt.md │ │ ├── load_ts_ckpt.py │ │ ├── select_example.md │ │ ├── select_example.py │ │ └── select_text.py ├── maskrcnn │ ├── Dockerfile │ ├── README.md │ ├── download_dataset.sh │ ├── pytorch │ │ ├── .flake8 │ │ ├── .github │ │ │ └── ISSUE_TEMPLATE │ │ │ │ ├── bug-report.md │ │ │ │ ├── feature-request.md │ │ │ │ └── questions-help-support.md │ │ ├── .gitignore │ │ ├── ABSTRACTIONS.md │ │ ├── CODE_OF_CONDUCT.md │ │ ├── CONTRIBUTING.md │ │ ├── INSTALL.md │ │ ├── LICENSE │ │ ├── MODEL_ZOO.md │ │ ├── README.md │ │ ├── TROUBLESHOOTING.md │ │ ├── configs │ │ │ └── e2e_mask_rcnn_R_50_FPN_1x.yaml │ │ ├── demo │ │ │ ├── Mask_R-CNN_demo.ipynb │ │ │ ├── README.md │ │ │ ├── demo_e2e_mask_rcnn_R_50_FPN_1x.png │ │ │ ├── demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png │ │ │ ├── predictor.py │ │ │ └── webcam.py │ │ ├── docker │ │ │ ├── Dockerfile │ │ │ └── docker-jupyter │ │ │ │ ├── Dockerfile │ │ │ │ └── jupyter_notebook_config.py │ │ ├── maskrcnn_benchmark │ │ │ ├── __init__.py │ │ │ ├── config │ │ │ │ ├── __init__.py │ │ │ │ ├── defaults.py │ │ │ │ └── paths_catalog.py │ │ │ ├── csrc │ │ │ │ ├── ROIAlign.h │ │ │ │ ├── ROIPool.h │ │ │ │ ├── SigmoidFocalLoss.h │ │ │ │ ├── cpu │ │ │ │ │ ├── ROIAlign_cpu.cpp │ │ │ │ │ ├── nms_cpu.cpp │ │ │ │ │ └── vision.h │ │ │ │ ├── cuda │ │ │ │ │ ├── ROIAlign_cuda.cu │ │ │ │ │ ├── ROIPool_cuda.cu │ │ │ │ │ ├── SigmoidFocalLoss_cuda.cu │ │ │ │ │ ├── nms.cu │ │ │ │ │ └── vision.h │ │ │ │ ├── nms.h │ │ │ │ └── vision.cpp │ │ │ ├── 
data │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ ├── collate_batch.py │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── coco.py │ │ │ │ │ ├── concat_dataset.py │ │ │ │ │ ├── evaluation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── coco │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── coco_eval.py │ │ │ │ │ │ └── voc │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ └── voc_eval.py │ │ │ │ │ ├── list_dataset.py │ │ │ │ │ └── voc.py │ │ │ │ ├── samplers │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── distributed.py │ │ │ │ │ ├── grouped_batch_sampler.py │ │ │ │ │ └── iteration_based_batch_sampler.py │ │ │ │ └── transforms │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── build.py │ │ │ │ │ └── transforms.py │ │ │ ├── engine │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── tester.py │ │ │ │ └── trainer.py │ │ │ ├── layers │ │ │ │ ├── __init__.py │ │ │ │ ├── _utils.py │ │ │ │ ├── batch_norm.py │ │ │ │ ├── misc.py │ │ │ │ ├── nms.py │ │ │ │ ├── roi_align.py │ │ │ │ ├── roi_pool.py │ │ │ │ ├── sigmoid_focal_loss.py │ │ │ │ └── smooth_l1_loss.py │ │ │ ├── modeling │ │ │ │ ├── __init__.py │ │ │ │ ├── backbone │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbone.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ └── resnet.py │ │ │ │ ├── balanced_positive_negative_sampler.py │ │ │ │ ├── box_coder.py │ │ │ │ ├── detector │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── detectors.py │ │ │ │ │ └── generalized_rcnn.py │ │ │ │ ├── make_layers.py │ │ │ │ ├── matcher.py │ │ │ │ ├── poolers.py │ │ │ │ ├── registry.py │ │ │ │ ├── roi_heads │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── box_head │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── box_head.py │ │ │ │ │ │ ├── inference.py │ │ │ │ │ │ ├── loss.py │ │ │ │ │ │ ├── roi_box_feature_extractors.py │ │ │ │ │ │ └── roi_box_predictors.py │ │ │ │ │ ├── keypoint_head │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── inference.py │ │ │ │ │ │ ├── keypoint_head.py │ │ │ │ │ │ ├── loss.py │ │ │ │ │ │ ├── roi_keypoint_feature_extractors.py │ │ │ │ │ │ └── 
roi_keypoint_predictors.py │ │ │ │ │ ├── mask_head │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── inference.py │ │ │ │ │ │ ├── loss.py │ │ │ │ │ │ ├── mask_head.py │ │ │ │ │ │ ├── roi_mask_feature_extractors.py │ │ │ │ │ │ └── roi_mask_predictors.py │ │ │ │ │ └── roi_heads.py │ │ │ │ ├── rpn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── anchor_generator.py │ │ │ │ │ ├── inference.py │ │ │ │ │ ├── loss.py │ │ │ │ │ ├── retinanet │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── inference.py │ │ │ │ │ │ ├── loss.py │ │ │ │ │ │ └── retinanet.py │ │ │ │ │ ├── rpn.py │ │ │ │ │ └── utils.py │ │ │ │ └── utils.py │ │ │ ├── solver │ │ │ │ ├── __init__.py │ │ │ │ ├── build.py │ │ │ │ └── lr_scheduler.py │ │ │ ├── structures │ │ │ │ ├── __init__.py │ │ │ │ ├── bounding_box.py │ │ │ │ ├── boxlist_ops.py │ │ │ │ ├── image_list.py │ │ │ │ ├── keypoint.py │ │ │ │ └── segmentation_mask.py │ │ │ └── utils │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── c2_model_loading.py │ │ │ │ ├── checkpoint.py │ │ │ │ ├── collect_env.py │ │ │ │ ├── comm.py │ │ │ │ ├── cv2_util.py │ │ │ │ ├── env.py │ │ │ │ ├── imports.py │ │ │ │ ├── logger.py │ │ │ │ ├── metric_logger.py │ │ │ │ ├── miscellaneous.py │ │ │ │ ├── mlperf_logger.py │ │ │ │ ├── model_serialization.py │ │ │ │ ├── model_zoo.py │ │ │ │ └── registry.py │ │ ├── setup.py │ │ ├── tests │ │ │ ├── checkpoint.py │ │ │ ├── test_data_samplers.py │ │ │ └── test_metric_logger.py │ │ └── tools │ │ │ ├── cityscapes │ │ │ ├── convert_cityscapes_to_coco.py │ │ │ └── instances2dict_with_polygons.py │ │ │ ├── test_net.py │ │ │ ├── train_mlperf.py │ │ │ └── train_net.py │ └── run_and_time.sh ├── minigo │ ├── README.md │ └── tensorflow │ │ ├── Dockerfile │ │ ├── minigo │ │ ├── .bazelrc │ │ ├── .gitignore │ │ ├── .pylintrc │ │ ├── LICENSE │ │ ├── README.md │ │ ├── RESULTS.md │ │ ├── WORKSPACE │ │ ├── __init__.py │ │ ├── batch_exporter.py │ │ ├── bigtable_input.py │ │ ├── bigtable_output.py │ │ ├── bootstrap.py │ │ ├── cc │ │ │ ├── .clang-format │ │ │ ├── BUILD 
│ │ │ ├── CPPLINT.cfg │ │ │ ├── README.md │ │ │ ├── algorithm.cc │ │ │ ├── algorithm.h │ │ │ ├── algorithm_test.cc │ │ │ ├── async │ │ │ │ ├── BUILD │ │ │ │ ├── poll_thread.cc │ │ │ │ ├── poll_thread.h │ │ │ │ ├── semaphore.h │ │ │ │ ├── sharded_executor.cc │ │ │ │ ├── sharded_executor.h │ │ │ │ ├── thread.cc │ │ │ │ ├── thread.h │ │ │ │ ├── thread_safe_queue.h │ │ │ │ └── thread_safe_queue_test.cc │ │ │ ├── benchmark.BUILD │ │ │ ├── color.cc │ │ │ ├── color.h │ │ │ ├── concurrent_selfplay.cc │ │ │ ├── config │ │ │ │ ├── BUILD │ │ │ │ └── minigo.bzl │ │ │ ├── configure_tensorflow.sh │ │ │ ├── constants.h │ │ │ ├── coord.cc │ │ │ ├── coord.h │ │ │ ├── coord_test.cc │ │ │ ├── cuda_configure.bzl │ │ │ ├── dual_net │ │ │ │ ├── BUILD │ │ │ │ ├── batching_dual_net.cc │ │ │ │ ├── batching_dual_net.h │ │ │ │ ├── batching_dual_net_test.cc │ │ │ │ ├── dual_net.cc │ │ │ │ ├── dual_net.h │ │ │ │ ├── dual_net_test.cc │ │ │ │ ├── factory.cc │ │ │ │ ├── factory.h │ │ │ │ ├── fake_dual_net.cc │ │ │ │ ├── fake_dual_net.h │ │ │ │ ├── inference_cache.cc │ │ │ │ ├── inference_cache.h │ │ │ │ ├── inference_cache_test.cc │ │ │ │ ├── lite_dual_net.cc │ │ │ │ ├── lite_dual_net.h │ │ │ │ ├── random_dual_net.cc │ │ │ │ ├── random_dual_net.h │ │ │ │ ├── reloading_dual_net.cc │ │ │ │ ├── reloading_dual_net.h │ │ │ │ ├── reloading_dual_net_test.cc │ │ │ │ ├── test_lite.minigo │ │ │ │ ├── test_model.pb │ │ │ │ ├── test_model.tflite │ │ │ │ ├── test_model.uff │ │ │ │ ├── test_tf.minigo │ │ │ │ ├── tf_dual_net.cc │ │ │ │ ├── tf_dual_net.h │ │ │ │ ├── tpu_dual_net.cc │ │ │ │ ├── tpu_dual_net.h │ │ │ │ ├── trt_dual_net.cc │ │ │ │ └── trt_dual_net.h │ │ │ ├── eval.cc │ │ │ ├── file │ │ │ │ ├── BUILD │ │ │ │ ├── directory_watcher.cc │ │ │ │ ├── directory_watcher.h │ │ │ │ ├── path.cc │ │ │ │ ├── path.h │ │ │ │ ├── path_test.cc │ │ │ │ ├── utils.h │ │ │ │ ├── utils_posix.cc │ │ │ │ ├── utils_test.cc │ │ │ │ ├── utils_tf.cc │ │ │ │ └── utils_windows.cc │ │ │ ├── game.cc │ │ │ ├── game.h │ │ │ ├── 
game_utils.cc │ │ │ ├── game_utils.h │ │ │ ├── group.cc │ │ │ ├── group.h │ │ │ ├── gtp.cc │ │ │ ├── gtp_client.cc │ │ │ ├── gtp_client.h │ │ │ ├── gtp_player.cc │ │ │ ├── gtp_player.h │ │ │ ├── init.cc │ │ │ ├── init.h │ │ │ ├── inline_vector.h │ │ │ ├── json.BUILD │ │ │ ├── logging.cc │ │ │ ├── logging.h │ │ │ ├── mcts_node.cc │ │ │ ├── mcts_node.h │ │ │ ├── mcts_node_test.cc │ │ │ ├── mcts_player.cc │ │ │ ├── mcts_player.h │ │ │ ├── mcts_player_test.cc │ │ │ ├── mcts_tree.cc │ │ │ ├── mcts_tree.h │ │ │ ├── mcts_tree_test.cc │ │ │ ├── minigui_gtp_client.cc │ │ │ ├── minigui_gtp_client.h │ │ │ ├── minigui_player.cc │ │ │ ├── minigui_player.h │ │ │ ├── model │ │ │ │ ├── BUILD │ │ │ │ ├── batching_model.cc │ │ │ │ ├── batching_model.h │ │ │ │ ├── batching_model_test.cc │ │ │ │ ├── buffered_model.cc │ │ │ │ ├── buffered_model.h │ │ │ │ ├── factory.cc │ │ │ │ ├── factory.h │ │ │ │ ├── features.cc │ │ │ │ ├── features.h │ │ │ │ ├── features_benchmark.cc │ │ │ │ ├── features_internal.h │ │ │ │ ├── features_test.cc │ │ │ │ ├── inference_cache.cc │ │ │ │ ├── inference_cache.h │ │ │ │ ├── inference_cache_test.cc │ │ │ │ ├── loader.cc │ │ │ │ ├── loader.h │ │ │ │ ├── model.cc │ │ │ │ ├── model.h │ │ │ │ ├── types.cc │ │ │ │ ├── types.h │ │ │ │ └── types_test.cc │ │ │ ├── move.cc │ │ │ ├── move.h │ │ │ ├── padded_array.h │ │ │ ├── pass_alive_test.cc │ │ │ ├── platform │ │ │ │ ├── BUILD │ │ │ │ ├── utils.h │ │ │ │ ├── utils_linux.cc │ │ │ │ ├── utils_osx.cc │ │ │ │ └── utils_windows.cc │ │ │ ├── position.cc │ │ │ ├── position.h │ │ │ ├── position_benchmark.cc │ │ │ ├── position_test.cc │ │ │ ├── puzzle.cc │ │ │ ├── random.cc │ │ │ ├── random.h │ │ │ ├── random_test.cc │ │ │ ├── replay_games.cc │ │ │ ├── sample_records.cc │ │ │ ├── selfplay.cc │ │ │ ├── sgf.cc │ │ │ ├── sgf.h │ │ │ ├── sgf_test.cc │ │ │ ├── simple_example.cc │ │ │ ├── stone.h │ │ │ ├── symmetries.cc │ │ │ ├── symmetries.h │ │ │ ├── symmetries_test.cc │ │ │ ├── tensorflow │ │ │ │ ├── BUILD │ │ │ │ └── 
copy_outputs.sh │ │ │ ├── tensorrt_configure.bzl │ │ │ ├── test.sh │ │ │ ├── test_utils.cc │ │ │ ├── test_utils.h │ │ │ ├── tf_bt_utils.cc │ │ │ ├── tf_bt_utils_dummy.cc │ │ │ ├── tf_utils.cc │ │ │ ├── tf_utils.h │ │ │ ├── tf_utils_dummy.cc │ │ │ ├── tfrzz_to_cbt.cc │ │ │ ├── thread_safe_queue.h │ │ │ ├── thread_safe_queue_test.cc │ │ │ ├── tiny_set.h │ │ │ ├── tpu_test.cc │ │ │ ├── wtf.BUILD │ │ │ ├── wtf_saver.cc │ │ │ ├── wtf_saver.h │ │ │ ├── zobrist.cc │ │ │ └── zobrist.h │ │ ├── cloud_logging.py │ │ ├── cluster │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── base │ │ │ │ ├── Dockerfile │ │ │ │ └── Makefile │ │ │ ├── calibrator │ │ │ │ ├── Dockerfile │ │ │ │ ├── Makefile │ │ │ │ └── calibrator-deployment.yaml │ │ │ ├── cgos │ │ │ │ ├── Dockerfile │ │ │ │ ├── Makefile │ │ │ │ └── cgos-player.yaml │ │ │ ├── cluster-down.sh │ │ │ ├── cluster-up-cpu.sh │ │ │ ├── cluster-up-gpu-large.sh │ │ │ ├── cluster-up-gpu.sh │ │ │ ├── cluster-up-simple.sh │ │ │ ├── cluster-up-tpu.sh │ │ │ ├── common.sh │ │ │ ├── create_table.sh │ │ │ ├── destroy.sh │ │ │ ├── eval_server │ │ │ │ ├── README.md │ │ │ │ ├── add_model.py │ │ │ │ └── launch_eval.py │ │ │ ├── evaluator │ │ │ │ ├── Dockerfile-cc │ │ │ │ ├── Dockerfile-py │ │ │ │ ├── Dockerfile-ringmaster │ │ │ │ ├── Makefile │ │ │ │ ├── cc-evaluator.yaml │ │ │ │ ├── deploy-cc-evaluator.sh │ │ │ │ ├── evaluator_cc_wrapper.sh │ │ │ │ ├── evaluator_py_wrapper.sh │ │ │ │ ├── evaluator_ringmaster_wrapper.py │ │ │ │ ├── evaluator_ringmaster_wrapper.sh │ │ │ │ ├── gpu-evaluator.yaml │ │ │ │ └── launch_eval.py │ │ │ ├── init-credentials.sh │ │ │ ├── make-all.sh │ │ │ ├── minigui │ │ │ │ ├── Dockerfile │ │ │ │ ├── Makefile │ │ │ │ ├── minigui-pod.yaml │ │ │ │ ├── run-local.sh │ │ │ │ └── simple-service.yaml │ │ │ ├── ringmaster │ │ │ │ ├── Makefile │ │ │ │ ├── example.ctl │ │ │ │ ├── lz-Dockerfile │ │ │ │ ├── mggtp-Dockerfile │ │ │ │ ├── p100-lz-tuning │ │ │ │ ├── ringmaster.yaml │ │ │ │ ├── ringmaster_wrapper.sh │ │ │ │ └── 
setup_ringmaster.py │ │ │ ├── selfplay │ │ │ │ ├── Dockerfile-cc │ │ │ │ ├── Dockerfile-py │ │ │ │ ├── Dockerfile-tpu │ │ │ │ ├── Makefile │ │ │ │ ├── README.md │ │ │ │ ├── cc-player.yaml │ │ │ │ ├── deploy-cc-player.sh │ │ │ │ ├── deploy-cpu-player.sh │ │ │ │ ├── deploy-gpu-player.sh │ │ │ │ ├── gpu-player.yaml │ │ │ │ ├── gpu-provision-daemonset.yaml │ │ │ │ ├── launch-tpu-deployment.sh │ │ │ │ ├── tpu-player-deployment-nr.yaml │ │ │ │ └── tpu-player-deployment.yaml │ │ │ ├── trainer │ │ │ │ ├── Dockerfile │ │ │ │ ├── Makefile │ │ │ │ ├── deploy-trainer.sh │ │ │ │ └── tpu-trainer-deployment.yaml │ │ │ ├── unset-common.sh │ │ │ ├── utils.sh │ │ │ └── var-status.sh │ │ ├── coords.py │ │ ├── dual_net.py │ │ ├── dual_net_edge_tpu.py │ │ ├── evaluate.py │ │ ├── features.py │ │ ├── freeze_graph.py │ │ ├── go.py │ │ ├── gtp.py │ │ ├── gtp_cmd_handlers.py │ │ ├── gtp_engine.py │ │ ├── mask_flags.py │ │ ├── mcts.py │ │ ├── minigo_model.py │ │ ├── minigui │ │ │ ├── README.md │ │ │ ├── app.ts │ │ │ ├── base.ts │ │ │ ├── board.ts │ │ │ ├── control │ │ │ │ ├── leelaz.ctl │ │ │ │ ├── minigo_edgetpu.ctl │ │ │ │ ├── minigo_py.ctl │ │ │ │ ├── minigo_tf.ctl │ │ │ │ └── vs.ctl │ │ │ ├── demo.ts │ │ │ ├── edgetpu │ │ │ │ ├── install_requirements.sh │ │ │ │ └── start_chromium.sh │ │ │ ├── fetch-and-run.sh │ │ │ ├── graph.ts │ │ │ ├── gtp_socket.ts │ │ │ ├── kiosk.ts │ │ │ ├── layer.ts │ │ │ ├── log.ts │ │ │ ├── minigui-common.sh │ │ │ ├── position.ts │ │ │ ├── requirements.txt │ │ │ ├── serve.py │ │ │ ├── static │ │ │ │ ├── app.js │ │ │ │ ├── base.js │ │ │ │ ├── board.js │ │ │ │ ├── demo.html │ │ │ │ ├── demo.js │ │ │ │ ├── graph.js │ │ │ │ ├── gtp_socket.js │ │ │ │ ├── heat_map.js │ │ │ │ ├── index.html │ │ │ │ ├── kiosk.html │ │ │ │ ├── kiosk.js │ │ │ │ ├── layer.js │ │ │ │ ├── log.js │ │ │ │ ├── lw_demo.html │ │ │ │ ├── position.js │ │ │ │ ├── require │ │ │ │ │ ├── LICENSE │ │ │ │ │ └── require.js │ │ │ │ ├── socketio │ │ │ │ │ ├── LICENSE │ │ │ │ │ └── socket.io.min.js │ │ │ │ 
├── study.html │ │ │ │ ├── study.js │ │ │ │ ├── style.css │ │ │ │ ├── util.js │ │ │ │ ├── variation_tree.js │ │ │ │ ├── view.js │ │ │ │ ├── vs.html │ │ │ │ ├── vs.js │ │ │ │ └── winrate_graph.js │ │ │ ├── study.ts │ │ │ ├── unset-minigui-common.sh │ │ │ ├── util.ts │ │ │ ├── variation_tree.ts │ │ │ ├── view.ts │ │ │ ├── vs.ts │ │ │ └── winrate_graph.ts │ │ ├── ml_perf │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── eval_models.py │ │ │ ├── flags │ │ │ │ ├── 9 │ │ │ │ │ ├── architecture.flags │ │ │ │ │ ├── bootstrap.flags │ │ │ │ │ ├── eval.flags │ │ │ │ │ ├── rl_loop.flags │ │ │ │ │ ├── selfplay.flags │ │ │ │ │ ├── train.flags │ │ │ │ │ ├── train_loop.flags │ │ │ │ │ └── validate.flags │ │ │ │ └── 19 │ │ │ │ │ ├── architecture.flags │ │ │ │ │ ├── bootstrap.flags │ │ │ │ │ ├── eval.flags │ │ │ │ │ ├── rl_loop.flags │ │ │ │ │ ├── selfplay.flags │ │ │ │ │ ├── train.flags │ │ │ │ │ ├── train_loop.flags │ │ │ │ │ └── validate.flags │ │ │ ├── get_data.py │ │ │ ├── init_from_checkpoint.py │ │ │ ├── make_checkpoint.py │ │ │ ├── reference_implementation.py │ │ │ ├── repeat_run.sh │ │ │ ├── scripts │ │ │ │ ├── bootstrap.sh │ │ │ │ ├── common.sh │ │ │ │ ├── init_from_checkpoint.sh │ │ │ │ ├── make_checkpoint.sh │ │ │ │ ├── start_selfplay.sh │ │ │ │ ├── stop_selfplay.sh │ │ │ │ └── train.sh │ │ │ ├── train_loop.py │ │ │ └── utils.py │ │ ├── notes.txt │ │ ├── oneoffs │ │ │ ├── __init__.py │ │ │ ├── bigquery_games_schema.json │ │ │ ├── bigquery_moves_schema.json │ │ │ ├── cbt_eval_sgfs.py │ │ │ ├── cbt_models.py │ │ │ ├── compare_examples.py │ │ │ ├── distillation.py │ │ │ ├── dump_game.py │ │ │ ├── embeddings.py │ │ │ ├── embeddings_graphs.py │ │ │ ├── eval_sgf_to_cbt.py │ │ │ ├── generate_tpu_graph_def.py │ │ │ ├── get_tpu_address.py │ │ │ ├── heatmap.py │ │ │ ├── inspect_examples.py │ │ │ ├── l2_cost_by_var.py │ │ │ ├── ladder_detector.py │ │ │ ├── launch_tensorboard.py │ │ │ ├── modelstats.sh │ │ │ ├── oneoff_utils.py │ │ │ ├── position_pv.py │ │ │ ├── prepare_bigquery.py 
│ │ │ ├── resign_analysis.py │ │ │ ├── retrain.sh │ │ │ ├── rotate_examples.py │ │ │ ├── sharp_positions.py │ │ │ ├── swa.py │ │ │ ├── symmetry_analysis.py │ │ │ ├── training_curve.py │ │ │ ├── unwrap_model.py │ │ │ ├── validate_misc.py │ │ │ └── wrap_model.py │ │ ├── player_interface.py │ │ ├── preprocessing.py │ │ ├── ratings │ │ │ ├── cbt_ratings.py │ │ │ ├── math_ratings.py │ │ │ ├── rate_subdir.py │ │ │ ├── ratings.py │ │ │ ├── schema.sql │ │ │ └── sqlite_ratings.py │ │ ├── requirements-analysis.txt │ │ ├── requirements-colab.txt │ │ ├── requirements.txt │ │ ├── rl_loop │ │ │ ├── bootstrap.py │ │ │ ├── distributed_flags │ │ │ ├── distributed_flags_nr │ │ │ ├── example_buffer.py │ │ │ ├── fsdb.py │ │ │ ├── local_flags │ │ │ ├── local_integration_test.py │ │ │ ├── selfplay.py │ │ │ ├── shipname.py │ │ │ ├── train_and_validate.py │ │ │ └── update_resign_threshold.py │ │ ├── selfplay.py │ │ ├── sgf_wrapper.py │ │ ├── strategies.py │ │ ├── symmetries.py │ │ ├── test.sh │ │ ├── testing │ │ │ ├── Dockerfile.v2 │ │ │ ├── Makefile │ │ │ ├── README.md │ │ │ ├── bootstrap_v2.sh │ │ │ └── setup.sh │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── example_game.sgf │ │ │ ├── run_tests.py │ │ │ ├── test_coords.py │ │ │ ├── test_dual_net.py │ │ │ ├── test_features.py │ │ │ ├── test_flags │ │ │ ├── test_go.py │ │ │ ├── test_mask_flags.py │ │ │ ├── test_mcts.py │ │ │ ├── test_preprocessing.py │ │ │ ├── test_sgf_wrapper.py │ │ │ ├── test_shipname.py │ │ │ ├── test_strategies.py │ │ │ ├── test_symmetries.py │ │ │ └── test_utils.py │ │ ├── train.py │ │ ├── tsconfig.json │ │ ├── utils.py │ │ └── validate.py │ │ ├── run.sh │ │ └── run_and_time.sh ├── mixtral8x22b │ ├── README.md │ ├── clm_datasets.py │ ├── config │ │ ├── config.yaml │ │ ├── dataset │ │ │ ├── c4_mlperf.yaml │ │ │ └── wikitext.yaml │ │ ├── experiment │ │ │ ├── convergence_template.yaml │ │ │ └── gbs256_tpu.yaml │ │ ├── model │ │ │ └── blank_model.yaml │ │ └── sched │ │ │ ├── CosineAnnealing.yaml │ │ │ └── 
WarmupHoldPolicy.yaml │ ├── docker │ │ ├── gpu │ │ │ ├── Dockerfile │ │ │ ├── Dockerfile.GCP │ │ │ ├── build_and_push_image.sh │ │ │ └── megatron_core.patch │ │ └── tpu │ │ │ ├── Dockerfile │ │ │ └── build_and_push_image.sh │ ├── download_dataset.py │ ├── file_utils.py │ ├── helm_context │ │ ├── Chart.yaml │ │ ├── selected-configuration.yaml │ │ ├── templates │ │ │ └── nemo-example.yaml │ │ └── values.yaml │ ├── mixtral80.json │ ├── mixtral822-instruct.json │ ├── mixtral822.json │ ├── mixtral87.json │ ├── mlperf_logging_utils.py │ ├── model_utils_gpu.py │ ├── model_utils_tpu.py │ ├── run_clm.py │ ├── scripts │ │ ├── gpu │ │ │ ├── checkpoint_download.py │ │ │ ├── dataset_preprocessing.py │ │ │ └── run.sub │ │ └── tpu │ │ │ └── distributed_checkpoint_saving.py │ └── trainer_utils_tpu.py ├── ncf │ ├── .gitignore │ ├── Dockerfile │ ├── README.md │ ├── alias_generator.py │ ├── convert.py │ ├── ncf.py │ ├── negative_sampling_cpp │ │ ├── negative_sampling.cpp │ │ ├── setup.py │ │ └── test.py │ ├── neumf.py │ ├── requirements.txt │ ├── run_and_time.sh │ └── utils.py ├── never-adopted │ ├── sentiment_analysis │ │ ├── README.md │ │ ├── download.py │ │ ├── download_dataset.sh │ │ ├── paddle │ │ │ ├── run_and_time.sh │ │ │ └── train.py │ │ ├── verify.py │ │ └── verify_dataset.sh │ └── speech_recognition │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data │ │ ├── .gitignore │ │ ├── __init__.py │ │ ├── bucketing_sampler.py │ │ ├── data-LibriSpeech-ref-cksum.out │ │ ├── data_loader.py │ │ ├── librispeech.py │ │ ├── merge_manifests.py │ │ └── utils.py │ │ ├── download_dataset.sh │ │ ├── labels.json │ │ ├── pytorch │ │ ├── .gitignore │ │ ├── decoder.py │ │ ├── docker │ │ │ ├── Dockerfile.gpu │ │ │ ├── base.gpu │ │ │ ├── build-docker.sh │ │ │ └── run-dev.sh │ │ ├── eval_model.py │ │ ├── model.py │ │ ├── params.py │ │ ├── run_and_time.sh │ │ └── train.py │ │ └── verify_dataset.sh ├── resnet-tf1 │ ├── Dockerfile │ ├── README.md │ ├── log_stitch.py │ ├── official │ │ ├── .gitignore │ │ 
├── Dockerfile.cpu │ │ ├── Dockerfile.gpu │ │ ├── README.md │ │ ├── __init__.py │ │ ├── benchmark │ │ │ └── datastore │ │ │ │ └── schema │ │ │ │ ├── benchmark_metric.json │ │ │ │ └── benchmark_run.json │ │ ├── requirements.txt │ │ ├── resnet │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── imagenet_main.py │ │ │ ├── imagenet_preprocessing.py │ │ │ ├── imagenet_test.py │ │ │ ├── layer_test.py │ │ │ ├── resnet_model.py │ │ │ └── resnet_run_loop.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── arg_parsers │ │ │ ├── __init__.py │ │ │ ├── parsers.py │ │ │ └── parsers_test.py │ │ │ ├── export │ │ │ ├── __init__.py │ │ │ ├── export.py │ │ │ └── export_test.py │ │ │ ├── logs │ │ │ ├── __init__.py │ │ │ ├── benchmark_uploader.py │ │ │ ├── hooks.py │ │ │ ├── hooks_helper.py │ │ │ ├── hooks_helper_test.py │ │ │ ├── hooks_test.py │ │ │ ├── logger.py │ │ │ ├── logger_test.py │ │ │ ├── metric_hook.py │ │ │ └── metric_hook_test.py │ │ │ ├── misc │ │ │ ├── __init__.py │ │ │ ├── model_helpers.py │ │ │ └── model_helpers_test.py │ │ │ └── testing │ │ │ ├── __init__.py │ │ │ ├── integration.py │ │ │ ├── pylint.rcfile │ │ │ ├── reference_data.py │ │ │ ├── reference_data │ │ │ ├── reference_data_test │ │ │ │ ├── dense │ │ │ │ │ ├── expected_graph │ │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ │ ├── model.ckpt.index │ │ │ │ │ ├── results.json │ │ │ │ │ └── tf_version.json │ │ │ │ └── uniform_random │ │ │ │ │ ├── expected_graph │ │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ │ ├── model.ckpt.index │ │ │ │ │ ├── results.json │ │ │ │ │ └── tf_version.json │ │ │ └── resnet │ │ │ │ ├── batch-size-32_bottleneck_projection_version-1_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_bottleneck_projection_version-2_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ 
├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_bottleneck_version-1_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_bottleneck_version-2_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_building_projection_version-1_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_building_projection_version-2_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_building_version-1_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ ├── batch-size-32_building_version-2_width-8_channels-4 │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ │ └── batch_norm │ │ │ │ ├── expected_graph │ │ │ │ ├── model.ckpt.data-00000-of-00001 │ │ │ │ ├── model.ckpt.index │ │ │ │ ├── results.json │ │ │ │ └── tf_version.json │ │ │ ├── reference_data_test.py │ │ │ └── scripts │ │ │ └── presubmit.sh │ ├── official_diff.txt │ ├── preprocess.sh │ ├── requirements.txt │ ├── run.sh │ └── run_and_time.sh ├── resnet-tf2 │ ├── README.md │ ├── README_old.md │ ├── download_dataset.sh │ ├── tensorflow2 │ │ ├── common.py │ │ ├── imagenet_preprocessing.py │ │ ├── lars_optimizer.py │ │ ├── lars_util.py │ │ ├── resnet_ctl_imagenet_main.py │ │ ├── resnet_model.py │ │ ├── resnet_runnable.py 
│ │ └── tf2_common │ │ │ ├── modeling │ │ │ └── performance.py │ │ │ ├── training │ │ │ ├── controller.py │ │ │ ├── optimizer_v2modified.py │ │ │ ├── runnable.py │ │ │ ├── standard_runnable.py │ │ │ └── utils.py │ │ │ └── utils │ │ │ ├── flags │ │ │ ├── __init__.py │ │ │ ├── _base.py │ │ │ ├── _benchmark.py │ │ │ ├── _conventions.py │ │ │ ├── _device.py │ │ │ ├── _distribution.py │ │ │ ├── _misc.py │ │ │ ├── _performance.py │ │ │ └── core.py │ │ │ ├── logs │ │ │ ├── __init__.py │ │ │ ├── cloud_lib.py │ │ │ ├── hooks.py │ │ │ ├── hooks_helper.py │ │ │ ├── logger.py │ │ │ └── metric_hook.py │ │ │ ├── misc │ │ │ ├── distribution_utils.py │ │ │ ├── keras_utils.py │ │ │ ├── model_helpers.py │ │ │ └── tpu_lib.py │ │ │ └── mlp_log │ │ │ ├── __init__.py │ │ │ └── mlp_log.py │ └── verify_dataset.sh ├── rnnt │ └── pytorch │ │ ├── .dockerignore │ │ ├── Dockerfile │ │ ├── LICENSE │ │ ├── NOTICE │ │ ├── README.md │ │ ├── common │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── dali │ │ │ │ ├── __init__.py │ │ │ │ ├── data_loader.py │ │ │ │ ├── iterator.py │ │ │ │ ├── pipeline.py │ │ │ │ └── sampler.py │ │ │ ├── dataset.py │ │ │ ├── features.py │ │ │ ├── helpers.py │ │ │ └── text.py │ │ ├── helpers.py │ │ ├── metrics.py │ │ ├── optimizers.py │ │ ├── rnn.py │ │ ├── sampler.py │ │ ├── tb_dllogger.py │ │ └── text │ │ │ ├── LICENSE │ │ │ ├── __init__.py │ │ │ ├── cleaners.py │ │ │ ├── numbers.py │ │ │ └── symbols.py │ │ ├── configs │ │ └── baseline_v3-1023sp.yaml │ │ ├── docker-compose.yaml │ │ ├── eval_model.py │ │ ├── inference.py │ │ ├── mlperf │ │ ├── __init__.py │ │ └── logging.py │ │ ├── requirements.txt │ │ ├── rnnt │ │ ├── config.py │ │ ├── decoder.py │ │ ├── loss.py │ │ └── model.py │ │ ├── rnnt_layers.svg │ │ ├── run_and_time.sh │ │ ├── scripts │ │ ├── create_sentencepieces.sh │ │ ├── docker │ │ │ ├── build.sh │ │ │ └── launch.sh │ │ ├── download_librispeech.sh │ │ ├── inference.sh │ │ ├── inference_benchmark.sh │ │ ├── 
preprocess_librispeech.sh │ │ ├── train.sh │ │ ├── train_bench.sh │ │ ├── train_debug.sh │ │ └── train_refactor.sh │ │ ├── tests │ │ ├── Dockerfile │ │ ├── requirements.txt │ │ └── rnnt │ │ │ └── dataset │ │ │ └── test_rnnt_wordpiece_tokenizer.py │ │ ├── train.py │ │ └── utils │ │ ├── __init__.py │ │ ├── convert_librispeech.py │ │ ├── download_librispeech.py │ │ ├── download_utils.py │ │ ├── inference_librispeech.csv │ │ ├── librispeech.csv │ │ └── preprocessing_utils.py ├── ssd-v1 │ ├── Dockerfile │ ├── download_dataset.sh │ ├── download_resnet34_backbone.sh │ ├── pth_to_pickle.py │ ├── requirements.txt │ └── ssd │ │ ├── README.md │ │ ├── base_model.py │ │ ├── bind_launch.py │ │ ├── coco.py │ │ ├── config_DGX1_32.sh │ │ ├── config_DGX1_multinode.sh │ │ ├── config_DGX1_singlenode.sh │ │ ├── distributed.py │ │ ├── eval.py │ │ ├── mlperf_logger.py │ │ ├── run.sub │ │ ├── run_and_time.sh │ │ ├── ssd300.py │ │ ├── train.py │ │ └── utils.py ├── transformer │ ├── README.md │ ├── data_download.py │ ├── download_data.sh │ ├── tensorflow │ │ ├── Dockerfile │ │ ├── bert │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── create_pretraining_data.py │ │ │ ├── extract_features.py │ │ │ ├── lamb_optimizer_v1.py │ │ │ ├── modeling.py │ │ │ ├── modeling_test.py │ │ │ ├── optimization.py │ │ │ ├── optimization_test.py │ │ │ ├── run_classifier.py │ │ │ ├── run_pretraining.py │ │ │ ├── run_squad.py │ │ │ ├── tokenization.py │ │ │ └── tokenization_test.py │ │ ├── process_data.py │ │ ├── requirements.txt │ │ ├── run_and_time.sh │ │ ├── run_preprocessing.sh │ │ ├── run_training.sh │ │ ├── transformer │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── compute_bleu.py │ │ │ ├── compute_bleu_test.py │ │ │ ├── data_download.py │ │ │ ├── model │ │ │ │ ├── __init__.py │ │ │ │ ├── attention_layer.py │ │ │ │ ├── beam_search.py │ │ │ │ ├── beam_search_test.py │ │ │ │ ├── embedding_layer.py │ │ │ │ ├── ffn_layer.py │ │ │ │ ├── model_params.py │ │ │ │ ├── model_utils.py │ │ │ │ ├── 
model_utils_test.py │ │ │ │ └── transformer.py │ │ │ ├── transformer_main.py │ │ │ ├── translate.py │ │ │ ├── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── dataset.py │ │ │ │ ├── metrics.py │ │ │ │ ├── tokenizer.py │ │ │ │ └── tokenizer_test.py │ │ │ └── vocab │ │ │ │ └── vocab.translate_ende_wmt32k.32768.subwords │ │ └── transformer_diff.txt │ └── verify_dataset.sh └── unet3d │ └── pytorch │ ├── Dockerfile │ ├── LICENCE │ ├── README.md │ ├── checksum.json │ ├── data_loading │ ├── data_loader.py │ └── pytorch_loader.py │ ├── evaluation_cases.txt │ ├── main.py │ ├── model │ ├── layers.py │ ├── losses.py │ └── unet3d.py │ ├── oldREADME.md │ ├── preprocess_dataset.py │ ├── requirements.txt │ ├── run_and_time.sh │ └── runtime │ ├── arguments.py │ ├── callbacks.py │ ├── distributed_utils.py │ ├── inference.py │ ├── logging.py │ └── training.py ├── single_stage_detector ├── .dockerignore ├── Dockerfile ├── README.md ├── mlcube │ ├── README.md │ └── mlcube.yaml ├── requirements.txt ├── scripts │ ├── backbone_to_onnx.py │ ├── download_backbone.sh │ ├── download_coco2017.sh │ ├── download_openimages_demo.sh │ ├── download_openimages_full.sh │ ├── download_openimages_mlperf.sh │ ├── fiftyone_openimages.py │ ├── pth_to_onnx.py │ └── pth_to_pickle.py └── ssd │ ├── .gitignore │ ├── LICENSE │ ├── bind.sh │ ├── check_logs.sh │ ├── coco_eval.py │ ├── coco_utils.py │ ├── config_DGXA100_001x08x032.sh │ ├── config_DGXA100_002x08x016.sh │ ├── config_DGXA100_008x08x004_inference_benchmark.sh │ ├── config_DGXA100_008x08x008.sh │ ├── config_DGXA100_032x08x032.sh │ ├── engine.py │ ├── model │ ├── __init__.py │ ├── anchor_utils.py │ ├── backbone_utils.py │ ├── boxes.py │ ├── feature_pyramid_network.py │ ├── focal_loss.py │ ├── image_list.py │ ├── resnet.py │ ├── retinanet.py │ ├── roi_heads.py │ ├── transform.py │ └── utils.py │ ├── presets.py │ ├── run.sub │ ├── run_and_time.sh │ ├── run_demo.sh │ ├── ssd_logger.py │ ├── train.py │ ├── transforms.py │ └── utils.py └── stable_diffusion ├── 
.dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── configs ├── train_01x08x08.yaml ├── train_32x08x02.yaml ├── train_32x08x02_raw_images.yaml ├── train_32x08x04.yaml └── train_32x08x08.yaml ├── imgs └── overview.png ├── ldm ├── data │ ├── __init__.py │ ├── base.py │ ├── composable_data_module.py │ ├── tsv.py │ ├── utils.py │ └── webdatasets.py ├── lr_scheduler.py ├── models │ ├── autoencoder.py │ ├── clip_encoder.py │ └── diffusion │ │ ├── __init__.py │ │ ├── ddim.py │ │ ├── ddpm.py │ │ ├── dpm_solver │ │ ├── __init__.py │ │ ├── dpm_solver.py │ │ └── sampler.py │ │ ├── plms.py │ │ └── sampling_util.py ├── modules │ ├── attention.py │ ├── diffusionmodules │ │ ├── __init__.py │ │ ├── model.py │ │ ├── openaimodel.py │ │ ├── upscaling.py │ │ └── util.py │ ├── distributions │ │ ├── __init__.py │ │ └── distributions.py │ ├── ema.py │ ├── encoders │ │ ├── __init__.py │ │ └── modules.py │ └── fid │ │ ├── README.md │ │ ├── fid_score.py │ │ └── inception.py └── util.py ├── main.py ├── mlperf_logging_utils.py ├── requirements.txt ├── run_and_time.sh ├── scripts ├── checkpoints │ ├── download_clip.sh │ ├── download_inception.sh │ └── download_sd.sh ├── datasets │ ├── coco-2014-validation-download.sh │ ├── coco-2014-validation-split-resize.sh │ ├── coco-split-resize.py │ ├── coco2014-validation-download-prompts.sh │ ├── coco2014-validation-download-stats.sh │ ├── filter-metadata.py │ ├── generate-fid-statistics.sh │ ├── laion400m-convert-images-to-moments.sh │ ├── laion400m-download-dataset.sh │ ├── laion400m-download-metadata.sh │ ├── laion400m-filter-metadata.sh │ ├── laion400m-filtered-download-images.sh │ └── laion400m-filtered-download-moments.sh ├── docker │ ├── build.sh │ └── launch.sh └── slurm │ ├── sbatch.sh │ └── srun.sh └── webdataset_images2latents.py /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in the repo. 
2 | # Unless a later match takes precedence, they will be requested for review when someone opens a pull request. 3 | * @mlcommons/wg-training 4 | 5 | /CODEOWNERS @mlcommons/staff 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | single_stage_detector/mlcube/workspace/* 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "recommendation/dlrm"] 2 | path = retired_benchmarks/dlrm/dlrm 3 | url = https://github.com/facebookresearch/dlrm.git 4 | branch = mlperf 5 | -------------------------------------------------------------------------------- /graph_neural_network/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.13.0-cuda11.6-cudnn8-devel 2 | 3 | WORKDIR /workspace/repository 4 | 5 | RUN pip install torch==1.13.0+cu117 torchvision==0.14.0+cu117 torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cu117 6 | RUN pip install scikit-learn==0.24.2 7 | RUN pip install torch_geometric==2.4.0 8 | RUN pip install --no-index torch_scatter==2.1.1 torch_sparse==0.6.17 -f https://data.pyg.org/whl/torch-1.13.0+cu117.html 9 | RUN pip install graphlearn-torch==0.2.2 10 | 11 | RUN apt update 12 | RUN apt install -y git 13 | RUN pip install git+https://github.com/mlcommons/logging.git 14 | 15 | # TF32 instead of FP32 for faster compute 16 | ENV NVIDIA_TF32_OVERRIDE=1 17 | 18 | COPY .. 
19 | WORKDIR /workspace/repository 20 | -------------------------------------------------------------------------------- /graph_neural_network/Dockerfile.h100: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:22.12-py3 2 | 3 | WORKDIR /workspace/repository 4 | 5 | RUN pip install scikit-learn==0.24.2 6 | RUN pip install torch_geometric==2.4.0 7 | RUN pip install torch_scatter==2.1.1 torch_sparse==0.6.17 8 | RUN pip install graphlearn-torch==0.2.2 9 | 10 | RUN apt update 11 | RUN apt install -y git 12 | RUN pip install git+https://github.com/mlcommons/logging.git 13 | 14 | # TF32 instead of FP32 for faster compute 15 | ENV NVIDIA_TF32_OVERRIDE=1 16 | 17 | COPY . . 18 | WORKDIR /workspace/repository 19 | 20 | RUN git clone https://github.com/alibaba/graphlearn-for-pytorch.git 21 | WORKDIR /workspace/repository/graphlearn-for-pytorch 22 | RUN git checkout 910cb55 23 | RUN git submodule update --init 24 | -------------------------------------------------------------------------------- /graph_neural_network/utilities.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | def create_ckpt_folder(base_dir, prefix="ckpt"): 5 | timestamp = time.strftime("%Y%m%d-%H%M%S") 6 | folder_name = f"{prefix}_{timestamp}" if prefix else timestamp 7 | full_path = os.path.join(base_dir, folder_name) 8 | if not os.path.exists(full_path): 9 | os.makedirs(full_path) 10 | return full_path 11 | 12 | -------------------------------------------------------------------------------- /language_model/tensorflow/bert/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /large_language_model_pretraining/nemo/utils/launch_nemo_convert.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 
#SBATCH -N 1 3 | #SBATCH --gpus-per-node 1 4 | #SBATCH -t 02:00:00 5 | #SBATCH --mem=0 6 | 7 | set -e 8 | 9 | : "${CONT_IMAGE_URL:?CONT_IMAGE_URL not set}" 10 | : "${SRC_PATH:?SRC_PATH not set}" 11 | : "${DST_PATH:?DST_PATH not set}" 12 | 13 | working_dir=$(dirname -- ${BASH_SOURCE[0]}) 14 | 15 | if [ ! -d $DST_PATH ]; then 16 | mkdir -p $DST_PATH 17 | fi 18 | 19 | container_maps="${SRC_PATH}:/source,${DST_PATH}:/destination,${working_dir}:/workspace/utils" 20 | 21 | srun --nodes=1 --ntasks-per-node=1 \ 22 | --container-image=$CONT_IMAGE_URL --container-mounts $container_maps --no-container-entrypoint \ 23 | python3 /workspace/utils/nemo_convert.py --source /source --destination /destination 24 | -------------------------------------------------------------------------------- /large_language_model_pretraining/nemo/utils/nemo_convert.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import argparse 3 | from nemo.collections.llm.gpt.model.llama import HFLlamaImporter 4 | parser = argparse.ArgumentParser() 5 | parser.add_argument("--source", default="/source", type=str) 6 | parser.add_argument("--destination", default="/destination", type=str) 7 | args = parser.parse_args() 8 | 9 | importer = HFLlamaImporter(args.source) 10 | importer.apply(args.destination) 11 | -------------------------------------------------------------------------------- /llama2_70b_lora/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.01-py3 2 | FROM ${FROM_IMAGE_NAME} 3 | 4 | WORKDIR /workspace/ft-llm 5 | ADD . 
/workspace/ft-llm 6 | 7 | RUN pip install -r requirements.txt 8 | RUN pip install flash-attn==2.4.1 --no-build-isolation 9 | -------------------------------------------------------------------------------- /llama2_70b_lora/configs/default_config.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_clipping: 0.3 5 | gradient_accumulation_steps: 1 6 | offload_optimizer_device: none 7 | offload_param_device: none 8 | zero3_init_flag: true 9 | zero3_save_16bit_model: true 10 | zero_stage: 3 11 | distributed_type: DEEPSPEED 12 | downcast_bf16: 'no' 13 | machine_rank: 0 14 | main_training_function: main 15 | mixed_precision: bf16 16 | num_machines: 1 17 | num_processes: 8 18 | rdzv_backend: static 19 | same_network: true 20 | tpu_env: [] 21 | tpu_use_cluster: false 22 | tpu_use_sudo: false 23 | use_cpu: false 24 | -------------------------------------------------------------------------------- /llama2_70b_lora/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mlcommons/logging.git 2 | transformers==4.38.1 3 | accelerate==0.27.2 4 | peft==0.8.2 5 | datasets==2.17.1 6 | deepspeed==0.13.2 -------------------------------------------------------------------------------- /llama2_70b_lora/run_docker.sh: -------------------------------------------------------------------------------- 1 | docker pull nvcr.io/nvidia/pytorch:23.09-py3 2 | docker run -v path_to_my_folder:/root/workspace --workdir /root/workspace --gpus all -it --rm --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 nvcr.io/nvidia/pytorch:23.09-py3 3 | -------------------------------------------------------------------------------- /llama2_70b_lora/run_llama_70B_scrolls_r16.sh: -------------------------------------------------------------------------------- 1 | accelerate launch --config_file 
configs/default_config.yaml scripts/train.py \ 2 | --dataset_path "./dataset" \ 3 | --model_path "./models/llama-v2-fused-qkv" \ 4 | --max_seq_len 8192 \ 5 | --bf16 True \ 6 | --logging_steps 24 \ 7 | --eval_steps 48 \ 8 | --output_dir "./results/llama-70b_scrolls_gov_report_r16_$1" \ 9 | --per_device_train_batch_size 1 \ 10 | --gradient_accumulation_steps 1 \ 11 | --lr_scheduler_type "cosine" \ 12 | --learning_rate 4e-4 \ 13 | --weight_decay 0.0001 \ 14 | --warmup_ratio 0 \ 15 | --max_grad_norm 0.3 \ 16 | --use_gradient_checkpointing True \ 17 | --target_eval_loss 0.925 \ 18 | --use_peft_lora True \ 19 | --lora_r 16 \ 20 | --lora_alpha 32 \ 21 | --lora_dropout 0.1 \ 22 | --max_steps 1024 \ 23 | --use_flash_attn \ 24 | --seed "$1" \ 25 | --lora_target_modules "qkv_proj,o_proj" 26 | -------------------------------------------------------------------------------- /recommendation_v2/torchrec_dlrm/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=pytorch/pytorch:1.13.1-cuda11.6-cudnn8-runtime 2 | FROM ${FROM_IMAGE_NAME} 3 | 4 | RUN apt-get -y update && \ 5 | apt-get -y install git 6 | 7 | WORKDIR /workspace/torchrec_dlrm 8 | COPY . . 
9 | 10 | RUN pip install --no-cache-dir -r requirements.txt 11 | -------------------------------------------------------------------------------- /recommendation_v2/torchrec_dlrm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/recommendation_v2/torchrec_dlrm/__init__.py -------------------------------------------------------------------------------- /recommendation_v2/torchrec_dlrm/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/recommendation_v2/torchrec_dlrm/data/__init__.py -------------------------------------------------------------------------------- /recommendation_v2/torchrec_dlrm/mlperf_logging_utils.py: -------------------------------------------------------------------------------- 1 | from mlperf_logging.mllog import constants 2 | from mlperf_logging.mllog.mllog import MLLogger 3 | 4 | 5 | def submission_info(mllogger: MLLogger, benchmark_name: str, submitter_name: str): 6 | """Logs required for a valid MLPerf submission.""" 7 | mllogger.event( 8 | key=constants.SUBMISSION_BENCHMARK, 9 | value=benchmark_name, 10 | ) 11 | mllogger.event( 12 | key=constants.SUBMISSION_ORG, 13 | value=submitter_name, 14 | ) 15 | mllogger.event( 16 | key=constants.SUBMISSION_DIVISION, 17 | value=constants.CLOSED, 18 | ) 19 | mllogger.event( 20 | key=constants.SUBMISSION_STATUS, 21 | value=constants.ONPREM, 22 | ) 23 | mllogger.event( 24 | key=constants.SUBMISSION_PLATFORM, 25 | value=submitter_name, 26 | ) 27 | -------------------------------------------------------------------------------- /recommendation_v2/torchrec_dlrm/requirements.txt: -------------------------------------------------------------------------------- 1 | fbgemm-gpu==0.3.2 2 | git+https://github.com/mlperf/logging.git 3 | 
torchmetrics==0.11.0 4 | torchrec==0.3.2 5 | -------------------------------------------------------------------------------- /reference_results.md: -------------------------------------------------------------------------------- 1 | The following table shows reference results, to be used to normalize benchmark results. 2 | **These results are NOT for optimized code and do NOT measure framework or hardware performance.** 3 | Individual seed results are provided only to show variance and should not be used for normalization. 4 | 5 | Benchmark|Reference Result|Seed 1|Seed 2|Seed 3|Seed 4|Seed 5 6 | ---|---|---|---|---|---|--- 7 | Image classification|529877|530571|529438|530373|523480|529877 8 | Object detection|299971|298906|299971|300471|298915|327101 9 | Translation|112187|111790|112187|111760|112879|149175 10 | Speech recognition|412417|405780|532790|450667|412417|344806 11 | Recommendation|2803|2823|2802|2803|2912|2541 12 | Sentiment analysis|214|287|207|210|214|324 13 | Reinforcement learning|263322|279252|277831|223113|260951|263322 14 | -------------------------------------------------------------------------------- /retired_benchmarks/dlrm/download_dataset.sh: -------------------------------------------------------------------------------- 1 | function download_20m { 2 | echo "Download ml-20m" 3 | curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip 4 | } 5 | 6 | function download_1m { 7 | echo "Downloading ml-1m" 8 | curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip 9 | } 10 | 11 | if [[ $1 == "ml-1m" ]] 12 | then 13 | download_1m 14 | else 15 | download_20m 16 | fi 17 | -------------------------------------------------------------------------------- /retired_benchmarks/dlrm/verify_dataset.sh: -------------------------------------------------------------------------------- 1 | function get_checker { 2 | if [[ "$OSTYPE" == "darwin"* ]]; then 3 | checkmd5=md5 4 | else 5 | checkmd5=md5sum 6 | fi 7 | 8 | echo $checkmd5 9 | } 10 | 11 | 
12 | function verify_1m { 13 | # From: curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip.md5 14 | hash=<(echo "MD5 (ml-1m.zip) = c4d9eecfca2ab87c1945afe126590906") 15 | local checkmd5=$(get_checker) 16 | if diff <($checkmd5 ml-1m.zip) $hash &> /dev/null 17 | then 18 | echo "PASSED" 19 | else 20 | echo "FAILED" 21 | fi 22 | } 23 | 24 | function verify_20m { 25 | # From: curl -O http://files.grouplens.org/datasets/movielens/ml-20m.zip.md5 26 | hash=<(echo "MD5 (ml-20m.zip) = cd245b17a1ae2cc31bb14903e1204af3") 27 | local checkmd5=$(get_checker) 28 | 29 | if diff <($checkmd5 ml-20m.zip) $hash &> /dev/null 30 | then 31 | echo "PASSED" 32 | else 33 | echo "FAILED" 34 | fi 35 | 36 | } 37 | 38 | 39 | if [[ $1 == "ml-1m" ]] 40 | then 41 | verify_1m 42 | else 43 | verify_20m 44 | fi 45 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | tags 3 | /data 4 | /results 5 | *.log 6 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | tags 3 | /data 4 | /results 5 | *.log 6 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime 2 | 3 | ENV LANG C.UTF-8 4 | ENV LC_ALL C.UTF-8 5 | 6 | ADD . 
/workspace/pytorch 7 | WORKDIR /workspace/pytorch 8 | 9 | RUN pip install -r requirements.txt 10 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | sacrebleu==1.2.10 2 | git+git://github.com/NVIDIA/apex.git@9041a868a1a253172d94b113a963375b9badd030#egg=apex 3 | mlperf-compliance==0.0.10 4 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/pytorch/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | DATASET_DIR='/data' 6 | 7 | SEED=${1:-"1"} 8 | TARGET=${2:-"24.00"} 9 | 10 | # run training 11 | python3 train.py \ 12 | --dataset-dir ${DATASET_DIR} \ 13 | --seed $SEED \ 14 | --target-bleu $TARGET 15 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/pytorch/run_and_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # runs benchmark and reports time to convergence 4 | # to use the script: 5 | # run_and_time.sh 6 | 7 | set -e 8 | 9 | # start timing 10 | start=$(date +%s) 11 | start_fmt=$(date +%Y-%m-%d\ %r) 12 | echo "STARTING TIMING RUN AT $start_fmt" 13 | 14 | # run benchmark 15 | seed=${1:-"1"} 16 | target=24.00 17 | 18 | echo "running benchmark" 19 | ./run.sh $seed $target 20 | 21 | sleep 3 22 | ret_code=$?; if [[ $ret_code != 0 ]]; then exit $ret_code; fi 23 | 24 | # end timing 25 | end=$(date +%s) 26 | end_fmt=$(date +%Y-%m-%d\ %r) 27 | echo "ENDING TIMING RUN AT $end_fmt" 28 | 29 | # report result 30 | result=$(( $end - $start )) 31 | result_name="RNN_TRANSLATOR" 32 | 33 | echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt" 34 | -------------------------------------------------------------------------------- 
/retired_benchmarks/gnmt/pytorch/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build . --rm -t gnmt:latest 4 | -------------------------------------------------------------------------------- /retired_benchmarks/gnmt/pytorch/scripts/docker/interactive.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-docker run -it --rm --ipc=host -v $PWD:/workspace/gnmt/ gnmt bash 4 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/.gitignore: -------------------------------------------------------------------------------- 1 | megatron/__pycache__/ 2 | megatron/data/__pycache__/ 3 | megatron/model/__pycache__/ 4 | megatron/mpu/__pycache__/ 5 | megatron/optimizer/__pycache__/ 6 | megatron/tokenizer/__pycache__/ 7 | megatron/fused_kernels/__pycache__/ 8 | megatron/fused_kernels/build/ 9 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:24.04-py3 2 | FROM ${FROM_IMAGE_NAME} 3 | 4 | # Copy code 5 | WORKDIR /workspace/llm 6 | COPY . . 
7 | RUN pip install -r requirements.txt 8 | ENV PYTHONPATH "/workspace/llm:${PYTHONPATH}" 9 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/common.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/common.pt -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/checksums/additional_checkpoint_files/metadata.json: -------------------------------------------------------------------------------- 1 | {"sharded_backend": "zarr"} 2 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/checksums/dataset_checksum.log: -------------------------------------------------------------------------------- 1 | 317a1c1b0b17fbd658e3e0b09d118ce9 c4_en_6_c4_spm_text_document.bin 2 | 5c8cfe37a26f919fb3998e84d1d07d8e c4_en_6_c4_spm_text_document.idx 3 | 5a84af04d55765993ecb5461af56b718 c4_en_7_c4_spm_text_document.bin 4 | 35b23332069840094e1a75332cdeab62 c4_en_7_c4_spm_text_document.idx 5 | 20d868f6cb865ce616ce7b9cf8312be0 c4_en_validation_subset_c4_spm_text_document.bin 6 | f76050809d0b42611eeef31d67d04224 c4_en_validation_subset_c4_spm_text_document.idx -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | 3 | from .core import check_is_distributed_checkpoint 4 | from .mapping import ShardedTensor, LocalNonpersitentObject 5 | from .serialization import load, save, load_common_state_dict -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/strategies/__init__.py: -------------------------------------------------------------------------------- 1 | """ Various loading and saving strategies """ 2 | 3 | try: 4 | import zarr 5 | import tensorstore 6 | from .zarr import _import_trigger 7 | except ImportError: 8 | print('Zarr strategies will not be registered because of missing packages') 9 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/gpt3/megatron-lm/megatron/core/dist_checkpointing/tests/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/data/Makefile: -------------------------------------------------------------------------------- 1 | CXXFLAGS += -O3 -Wall -shared -std=c++11 -fPIC -fdiagnostics-color 2 | CPPFLAGS += $(shell python3 -m pybind11 --includes) 3 | LIBNAME = helpers 4 | LIBEXT = $(shell python3-config --extension-suffix) 5 | 6 | default: $(LIBNAME)$(LIBEXT) 7 | 8 | %$(LIBEXT): %.cpp 9 | $(CXX) $(CXXFLAGS) $(CPPFLAGS) $< -o $@ 10 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import indexed_dataset 2 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/data/test/test_preprocess_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | IMPL=cached 4 | python ../preprocess_data.py \ 5 | --input test_samples.json \ 6 | --vocab vocab.txt \ 7 | --dataset-impl ${IMPL} \ 8 | --output-prefix test_samples_${IMPL} \ 9 | --workers 1 \ 10 | --log-interval 2 11 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/compat.h: -------------------------------------------------------------------------------- 1 | /* coding=utf-8 2 | * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /*This code is copied from NVIDIA apex: 18 | * https://github.com/NVIDIA/apex 19 | * with minor changes. 
*/ 20 | 21 | 22 | 23 | #ifndef TORCH_CHECK 24 | #define TORCH_CHECK AT_CHECK 25 | #endif 26 | 27 | #ifdef VERSION_GE_1_3 28 | #define DATA_PTR data_ptr 29 | #else 30 | #define DATA_PTR data 31 | #endif 32 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/gpt3/megatron-lm/megatron/fused_kernels/tests/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/model/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | #from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm 17 | from .fused_layer_norm import MixedFusedLayerNorm1P as LayerNorm 18 | 19 | from .distributed import DistributedDataParallel 20 | from .gpt_model import GPTModel 21 | from .language_model import get_language_model 22 | from .module import Float16Module 23 | from .enums import ModelType 24 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/model/enums.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import enum 17 | 18 | class ModelType(enum.Enum): 19 | encoder_or_decoder = 1 20 | encoder_and_decoder = 2 21 | 22 | class LayerType(enum.Enum): 23 | encoder = 1 24 | decoder = 2 25 | 26 | class AttnType(enum.Enum): 27 | self_attn = 1 28 | cross_attn = 2 29 | 30 | class AttnMaskType(enum.Enum): 31 | padding = 1 32 | causal = 2 33 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/gpt3/megatron-lm/megatron/mpu/tests/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/text_generation/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .api import ( 18 | generate, 19 | generate_and_post_process, 20 | beam_search_and_post_process) 21 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/megatron/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | 17 | from .tokenizer import build_tokenizer 18 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/mlcommons/logging.git@2.1.0-rc1 2 | git+https://github.com/NVIDIA/mlperf-common.git 3 | zarr==2.13 4 | tensorstore==0.1.27 5 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/scripts/common_bf16.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "optimizer": { 4 | "param_groups": [ 5 | { 6 | "wd_mult": 1.0, 7 | "lr_mult": 1.0, 8 | "lr": 5.984178321979609e-05, 9 | "bias_correction": true, 10 | "betas": [ 11 | 0.9, 12 | 0.95 13 | ], 14 | "eps": 1e-08, 15 | "weight_decay": 0.1, 16 | "step": 4000 17 | } 18 | ] 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/scripts/common_fp32.json: -------------------------------------------------------------------------------- 1 | { 2 | "optimizer": { 3 | "param_groups": [ 4 | { 5 | "wd_mult": 1.0, 6 | "lr_mult": 1.0, 7 | "lr": 5.984178321979609e-05, 8 | "bias_correction": true, 9 | "betas": [ 10 | 0.9, 11 | 0.95 12 | ], 13 | "eps": 1e-08, 14 | "weight_decay": 0.1, 15 | "step": 4000 16 | } 17 | ] 18 | } 19 | } -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/megatron-lm/scripts/load_checkpoint.md: -------------------------------------------------------------------------------- 1 | # Load checkpoint 2 | 3 | This is an example script to load the checkpoint using PyTorch for LLM benchmark. 
#!/bin/bash
#SBATCH -N 1
#SBATCH --exclusive
#SBATCH --dependency=singleton
#SBATCH --mem=0
#SBATCH --array=6-7%8
#SBATCH --requeue

# Tokenize C4 training shards with NeMo's preprocess_data_for_megatron.py.
#
# Usage: sbatch preprocess.sh <C4_PATH>
#   <C4_PATH> is mounted into the container at the same path and must hold
#   en_merge/c4-train.en_<ID>.json.gz and tokenizers/c4_spm/sentencepiece.model.
#   Output is written under <C4_PATH>/preprocessed_c4_spm/.
#
# The job is a Slurm array over task IDs 6-7 (see --array above); each task
# processes the shard whose suffix equals its SLURM_ARRAY_TASK_ID.
C4_PATH=$1

# ${C4_PATH} is expanded here, by the submitting shell, while
# \${SLURM_ARRAY_TASK_ID} is escaped so that it is expanded later by the
# bash process started inside the container.
# NeMo is cloned and pinned to a fixed commit before the preprocessing runs.
srun --container-image nvcr.io/nvidia/pytorch:21.12-py3 \
 --container-mounts ${C4_PATH}:${C4_PATH} \
 bash -c \
 " git clone https://github.com/NVIDIA/NeMo.git; \
 cd NeMo && git checkout f3ad584b94170bc3ea197df29eb9ef9c96061730 && bash ./reinstall.sh; \
 python /workspace/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \
 --input ${C4_PATH}/en_merge/c4-train.en_\${SLURM_ARRAY_TASK_ID}.json.gz \
 --tokenizer-library sentencepiece \
 --tokenizer-model ${C4_PATH}/tokenizers/c4_spm/sentencepiece.model \
 --output-prefix ${C4_PATH}/preprocessed_c4_spm/c4_en_\${SLURM_ARRAY_TASK_ID}_c4_spm_text_document \
 --dataset-impl mmap \
 --workers 128 "
--exclusive 4 | #SBATCH --dependency=singleton 5 | #SBATCH --mem=0 6 | #SBATCH --requeue 7 | 8 | C4_PATH=$1 9 | VALIDATION_JSON_PATH=$2 10 | 11 | srun --container-image nvcr.io/nvidia/pytorch:21.12-py3 \ 12 | --container-mounts ${C4_PATH}:${C4_PATH} \ 13 | bash -c \ 14 | " git clone https://github.com/NVIDIA/NeMo.git; \ 15 | cd NeMo && git checkout f3ad584b94170bc3ea197df29eb9ef9c96061730 && bash ./reinstall.sh; \ 16 | python /workspace/NeMo/scripts/nlp_language_modeling/preprocess_data_for_megatron.py \ 17 | --input ${VALIDATION_JSON_PATH} \ 18 | --tokenizer-library sentencepiece \ 19 | --tokenizer-model ${C4_PATH}/tokenizers/c4_spm/sentencepiece.model \ 20 | --output-prefix ${C4_PATH}/preprocessed_c4_spm/c4_en_validation_subset_c4_spm_text_document \ 21 | --dataset-impl mmap \ 22 | --workers 128 " 23 | -------------------------------------------------------------------------------- /retired_benchmarks/gpt3/paxml/utils/load_ts_ckpt.py: -------------------------------------------------------------------------------- 1 | # Lint as: python3 2 | """Script to load layer(s) of the LLM checkpoint using TensorStore. 3 | More details about TensorStore, please visit 4 | https://github.com/google/tensorstore . 
5 | """ 6 | 7 | import argparse 8 | import tensorstore as ts 9 | 10 | parser = argparse.ArgumentParser(description='Checkpoint loading for LLM.') 11 | parser.add_argument( 12 | '--input_path', 13 | type=str, 14 | default='', 15 | help='Input directory for layer(s) of the saved checkpoint.') 16 | args = parser.parse_args() 17 | 18 | if __name__ == '__main__': 19 | input_path = args.input_path 20 | spec = {'driver': 'zarr', 'kvstore': {}} 21 | spec['kvstore'] = { 22 | 'driver': 'file', 23 | 'path': input_path, 24 | } 25 | t = ts.open(ts.Spec(spec), open=True).result() 26 | t_v = t.read().result() 27 | 28 | print("path = ", input_path, 29 | ", type = ", type(t_v), 30 | ", shape = ", t_v.shape) 31 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/download_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get COCO 2017 data sets 4 | mkdir -p pytorch/datasets/coco 5 | pushd pytorch/datasets/coco 6 | 7 | curl -O https://dl.fbaipublicfiles.com/detectron/coco/coco_annotations_minival.tgz 8 | tar xzf coco_annotations_minival.tgz 9 | 10 | curl -O http://images.cocodataset.org/zips/train2017.zip 11 | unzip train2017.zip 12 | 13 | curl -O http://images.cocodataset.org/zips/val2017.zip 14 | unzip val2017.zip 15 | 16 | curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip 17 | unzip annotations_trainval2017.zip 18 | 19 | # TBD: MD5 verification 20 | # $md5sum *.zip *.tgz 21 | #f4bbac642086de4f52a3fdda2de5fa2c annotations_trainval2017.zip 22 | #cced6f7f71b7629ddf16f17bbcfab6b2 train2017.zip 23 | #442b8da7639aecaf257c1dceb8ba8c80 val2017.zip 24 | #2d2b9d2283adb5e3b8d25eec88e65064 coco_annotations_minival.tgz 25 | 26 | popd 27 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/.flake8: 
-------------------------------------------------------------------------------- 1 | # This is an example .flake8 config, used when developing *Black* itself. 2 | # Keep in sync with setup.cfg which is used for source packages. 3 | 4 | [flake8] 5 | ignore = E203, E266, E501, W503 6 | max-line-length = 80 7 | max-complexity = 18 8 | select = B,C,E,F,W,T4,B9 9 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F680Feature Request" 3 | about: Submit a proposal/request for a new Mask R-CNN Benchmark feature 4 | 5 | --- 6 | 7 | ## 🚀 Feature 8 | 9 | 10 | ## Motivation 11 | 12 | 13 | 14 | ## Pitch 15 | 16 | 17 | 18 | ## Alternatives 19 | 20 | 21 | 22 | ## Additional context 23 | 24 | 25 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/.github/ISSUE_TEMPLATE/questions-help-support.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "❓Questions/Help/Support" 3 | about: Do you need support? 
4 | 5 | --- 6 | 7 | ## ❓ Questions and Help 8 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | _ext 4 | *.pyc 5 | *.so 6 | maskrcnn_benchmark.egg-info/ 7 | build/ 8 | dist/ 9 | 10 | # pytorch/python/numpy formats 11 | *.pth 12 | *.pkl 13 | *.npy 14 | 15 | # ipython/jupyter notebooks 16 | *.ipynb 17 | **/.ipynb_checkpoints/ 18 | 19 | # Editor temporaries 20 | *.swn 21 | *.swo 22 | *.swp 23 | *~ 24 | 25 | # Pycharm editor settings 26 | .idea 27 | 28 | # project dirs 29 | /datasets 30 | /models 31 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. 
6 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/demo/demo_e2e_mask_rcnn_R_50_FPN_1x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/maskrcnn/pytorch/demo/demo_e2e_mask_rcnn_R_50_FPN_1x.png -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/maskrcnn/pytorch/demo/demo_e2e_mask_rcnn_X_101_32x8d_FPN_1x.png -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
15 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/config/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | from .defaults import _C as cfg 16 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/csrc/nms.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. 
// Non-maximum suppression entry point that dispatches on the device of `dets`.
//
// dets:      boxes tensor passed through to the backend implementation.
// scores:    per-box scores; on the CUDA path they are appended to `dets`
//            as an extra column (dimension 1) before the kernel is called.
// threshold: suppression threshold forwarded unchanged to the backend.
//
// Returns whatever nms_cuda / nms_cpu returns. For an empty CUDA input an
// empty int64 tensor placed on the CPU is returned without calling the kernel.
// Raises (AT_ERROR) when `dets` is a CUDA tensor but the extension was built
// without WITH_CUDA.
at::Tensor nms(const at::Tensor& dets,
               const at::Tensor& scores,
               const float threshold) {

  if (dets.type().is_cuda()) {
#ifdef WITH_CUDA
    // TODO raise error if not compiled with CUDA
    // Short-circuit: nothing to suppress, so skip the kernel launch.
    if (dets.numel() == 0)
      return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
    // nms_cuda takes a single tensor: boxes with the score as the last column.
    auto b = at::cat({dets, scores.unsqueeze(1)}, 1);
    return nms_cuda(b, threshold);
#else
    AT_ERROR("Not compiled with GPU support");
#endif
  }

  // CPU path: boxes and scores are passed separately.
  at::Tensor result = nms_cpu(dets, scores, threshold);
  return result;
}
// Python bindings for the compiled extension. TORCH_EXTENSION_NAME is
// supplied by the build, so the module name is not fixed in this file.
// Each m.def(...) exposes one C++ function under the given Python name with
// the given docstring: NMS plus the forward/backward pairs for ROIAlign,
// ROIPool and SigmoidFocalLoss declared in the headers included above.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("nms", &nms, "non-maximum suppression");
  m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward");
  m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward");
  m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward");
  m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward");
  m.def("sigmoid_focalloss_forward", &SigmoidFocalLoss_forward, "SigmoidFocalLoss_forward");
  m.def("sigmoid_focalloss_backward", &SigmoidFocalLoss_backward, "SigmoidFocalLoss_backward");
}
15 | from .build import make_data_loader 16 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | from .coco import COCODataset 16 | from .voc import PascalVOCDataset 17 | from .concat_dataset import ConcatDataset 18 | 19 | __all__ = ["COCODataset", "ConcatDataset", "PascalVOCDataset"] 20 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | from .distributed import DistributedSampler 16 | from .grouped_batch_sampler import GroupedBatchSampler 17 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 18 | 19 | __all__ = ["DistributedSampler", "GroupedBatchSampler", "IterationBasedBatchSampler"] 20 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
15 | from .transforms import Compose 16 | from .transforms import Resize 17 | from .transforms import RandomHorizontalFlip 18 | from .transforms import ToTensor 19 | from .transforms import Normalize 20 | 21 | from .build import build_transforms 22 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | # from ._utils import _C 16 | from maskrcnn_benchmark import _C 17 | 18 | nms = _C.nms 19 | # nms.__doc__ = """ 20 | # This function performs Non-maximum suppression""" 21 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | from .backbone import build_backbone 16 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/detector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
def build_detection_model(cfg):
    """Instantiate the detector selected by ``cfg.MODEL.META_ARCHITECTURE``.

    The architecture name is looked up in ``_DETECTION_META_ARCHITECTURES``
    and the class found there is called with ``cfg``. A name that is not
    registered surfaces as the ``KeyError`` raised by that lookup.
    """
    known_architectures = _DETECTION_META_ARCHITECTURES
    architecture_name = cfg.MODEL.META_ARCHITECTURE
    detector_factory = known_architectures[architecture_name]
    model = detector_factory(cfg)
    return model
# Module-level registries: four separate Registry instances created once at
# import time and shared by everything that imports this module.
# NOTE(review): the names suggest they collect backbone builders, ROI box
# feature extractors, ROI box predictors and RPN heads respectively -- confirm
# against the modules that register into them; Registry itself is defined in
# maskrcnn_benchmark.utils.registry.
BACKBONES = Registry()
ROI_BOX_FEATURE_EXTRACTORS = Registry()
ROI_BOX_PREDICTOR = Registry()
RPN_HEADS = Registry()
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/roi_heads/keypoint_head/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/roi_heads/mask_head/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/rpn/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | # from .rpn import build_rpn 16 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/modeling/rpn/retinanet/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/solver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | from .build import make_optimizer 16 | from .build import make_lr_scheduler 17 | from .lr_scheduler import WarmupMultiStepLR 18 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/structures/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/utils/README.md: -------------------------------------------------------------------------------- 1 | # Utility functions 2 | 3 | This folder contains utility functions that are not used in the 4 | core library, but are useful for building models or training 5 | code using the config system. 6 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/pytorch/maskrcnn_benchmark/utils/miscellaneous.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 15 | import errno 16 | import os 17 | 18 | 19 | def mkdir(path): 20 | try: 21 | os.makedirs(path) 22 | except OSError as e: 23 | if e.errno != errno.EEXIST: 24 | raise 25 | -------------------------------------------------------------------------------- /retired_benchmarks/maskrcnn/run_and_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Runs benchmark and reports time to convergence 4 | 5 | pushd pytorch 6 | 7 | # Single GPU training 8 | time python tools/train_mlperf.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" \ 9 | SOLVER.IMS_PER_BATCH 2 TEST.IMS_PER_BATCH 1 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" SOLVER.BASE_LR 0.0025 10 | 11 | popd 12 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | 
WORKDIR /research 3 | RUN apt-get update 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | ca-certificates \ 6 | build-essential \ 7 | git \ 8 | python \ 9 | python-pip 10 | ENV HOME /research 11 | ENV PYENV_ROOT $HOME/.pyenv 12 | ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH 13 | RUN apt-get install -y python-setuptools 14 | RUN apt-get install -y python-pip python3-pip virtualenv htop 15 | RUN pip3 install virtualenv 16 | RUN pip3 install virtualenvwrapper 17 | RUN pip3 install --upgrade numpy scipy sklearn tf-nightly-gpu 18 | #RUN pip3 install --upgrade numpy scipy sklearn tf-nightly-gpu 19 | # Mount data into the docker 20 | ADD . /research/reinforcement 21 | WORKDIR /research/reinforcement 22 | # RUN /bin/bash env_setup.sh 23 | 24 | RUN pip3 install --upgrade pip 25 | RUN pip3 install --upgrade setuptools 26 | RUN pip3 install -r minigo/requirements.txt 27 | #RUN pip3 install "tensorflow-gpu>=1.5,<1.6" 28 | 29 | ENTRYPOINT ["/bin/bash"] 30 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/.bazelrc: -------------------------------------------------------------------------------- 1 | build --define=tf=1 2 | test -c dbg 3 | 4 | # Some of the Bazel rules used to precompile TensorFlow don't respect Bazel's 5 | # "manual" tag. The following hack prevents Bazel from compiling TensorFlow 6 | # from source when executing a command such as: bazel test cc/... 7 | test //cc/... -- -//cc/tensorflow/... 8 | 9 | # These .bazelrc files are generated by the cc/configure_tensorflow.sh script. 
10 | try-import %workspace%/tf_configure.bazelrc 11 | try-import %workspace%/tensorflow.bazelrc 12 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/.gitignore: -------------------------------------------------------------------------------- 1 | lib 2 | lib64 3 | bin 4 | data 5 | *__pycache__ 6 | pip-selfcheck.json 7 | *.pyc 8 | sgf 9 | pyvenv.cfg 10 | .DS_store 11 | logs/ 12 | 13 | # Vim temp files 14 | *.swp 15 | *.swo 16 | *~ 17 | 18 | .mypy_cache 19 | 20 | # Ignore any staging directory. We use this directory for docker-file creation. 21 | staging/ 22 | 23 | bazel-* 24 | cc/tensorflow/ 25 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/.clang-format: -------------------------------------------------------------------------------- 1 | BasedOnStyle: Google 2 | 3 | Cpp11BracedListStyle: true 4 | DerivePointerAlignment: false 5 | PointerAlignment: Left 6 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | # Stop cpplint complaining about including 2 | filter=-build/c++11 3 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/benchmark.BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "benchmark", 3 | srcs = glob( 4 | ["src/*.cc"], 
5 | exclude = [ 6 | "src/re_posix.cc", 7 | "src/gnuregex.cc", 8 | ], 9 | ), 10 | hdrs = glob( 11 | [ 12 | "src/*.h", 13 | "include/benchmark/*.h", 14 | ], 15 | exclude = [ 16 | "src/re_posix.h", 17 | "src/gnuregex.h", 18 | ], 19 | ), 20 | copts = [ 21 | "-DHAVE_STD_REGEX", 22 | ], 23 | includes = [ 24 | "include", 25 | ], 26 | linkopts = ["-pthread"], 27 | visibility = ["//visibility:public"], 28 | ) 29 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_lite.minigo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_lite.minigo -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.pb -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.tflite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.tflite -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.uff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_model.uff -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_tf.minigo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/test_tf.minigo -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/dual_net/trt_dual_net.h: -------------------------------------------------------------------------------- 1 | #ifndef CC_DUAL_NET_TRT_DUAL_NET_H_ 2 | #define CC_DUAL_NET_TRT_DUAL_NET_H_ 3 | 4 | #include "cc/dual_net/dual_net.h" 5 | 6 | namespace minigo { 7 | 8 | class TrtDualNetFactory : public DualNetFactory { 9 | public: 10 | TrtDualNetFactory(); 11 | 12 | int GetBufferCount() const override; 13 | 14 | std::unique_ptr NewDualNet(const std::string& model) override; 15 | 16 | private: 17 | int device_count_; 18 | }; 19 | 20 | } // namespace minigo 21 | 22 | #endif // CC_DUAL_NET_TRT_DUAL_NET_H_ 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/group.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "cc/group.h" 16 | 17 | namespace minigo { 18 | 19 | constexpr int Group::kMaxNumGroups; 20 | 21 | } // namespace minigo 22 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/init.h: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef CC_INIT_H_ 16 | #define CC_INIT_H_ 17 | 18 | namespace minigo { 19 | 20 | // Initializes base libraries like gflags and symbolizer. 21 | // Call at the very top of main. 
22 | void Init(int* pargc, char*** pargv); 23 | 24 | } // namespace minigo 25 | 26 | #endif // CC_INIT_H_ 27 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/json.BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "json", 3 | hdrs = ["single_include/nlohmann/json.hpp"], 4 | includes = ["single_include"], 5 | visibility = ["//visibility:public"], 6 | ) 7 | 8 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/model/types.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "cc/model/types.h" 16 | 17 | namespace minigo { 18 | 19 | std::ostream& operator<<(std::ostream& os, const TensorShape& shape) { 20 | os << "["; 21 | if (!shape.empty()) { 22 | os << shape[0]; 23 | for (int i = 1; i < shape.size(); ++i) { 24 | os << ", " << shape[i]; 25 | } 26 | } 27 | os << "]"; 28 | return os; 29 | } 30 | 31 | } // namespace minigo 32 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/move.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2018 Google LLC 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "cc/move.h" 16 | 17 | #include "absl/strings/str_cat.h" 18 | 19 | namespace minigo { 20 | 21 | std::string Move::ToSgf() const { 22 | return absl::StrCat(ColorToCode(color), "[", c.ToSgf(), "]"); 23 | } 24 | 25 | } // namespace minigo 26 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/platform/BUILD: -------------------------------------------------------------------------------- 1 | package(default_visibility = [ 2 | "//cc:__subpackages__", 3 | ]) 4 | 5 | licenses(["notice"]) # Apache License 2.0 6 | 7 | cc_library( 8 | name = "platform", 9 | srcs = select({ 10 | "@bazel_tools//src/conditions:darwin": ["utils_osx.cc"], 11 | "@bazel_tools//src/conditions:windows": ["utils_windows.cc"], 12 | "//conditions:default": ["utils_linux.cc"], 13 | }), 14 | hdrs = [ 15 | "utils.h", 16 | ], 17 | deps = [ 18 | "//cc:logging", 19 | ], 20 | ) 21 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cc/wtf.BUILD: -------------------------------------------------------------------------------- 1 | cc_library( 2 | name = "wtf", 3 | srcs = [ 4 | "bindings/cpp/buffer.cc", 5 | "bindings/cpp/event.cc", 6 | "bindings/cpp/platform.cc", 7 | "bindings/cpp/runtime.cc", 8 | ] + glob(["bindings/cpp/include/wtf/platform/*.h"]), 9 | hdrs = [ 10 | "bindings/cpp/include/wtf/argtypes.h", 11 | "bindings/cpp/include/wtf/config.h", 12 | "bindings/cpp/include/wtf/event.h", 13 | "bindings/cpp/include/wtf/macros.h", 14 | "bindings/cpp/include/wtf/platform.h", 15 | "bindings/cpp/include/wtf/runtime.h", 16 | "bindings/cpp/include/wtf/buffer.h", 17 | ], 18 | copts = [ 19 | "-O3", 20 | ], 21 | includes = [ 22 | "bindings/cpp/include/", 23 | ], 24 | visibility = ["//visibility:public"], 25 | ) 26 | -------------------------------------------------------------------------------- 
/retired_benchmarks/minigo/tensorflow/minigo/cluster/.gitignore: -------------------------------------------------------------------------------- 1 | *.json 2 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/base/Makefile: -------------------------------------------------------------------------------- 1 | # See existing images with 2 | # gcloud container images list-tags gcr.io/$PROJECT/cc-base 3 | # 4 | # Usage: 5 | # VERSION_TAG=0.XY make target 6 | 7 | base-image: 8 | mkdir -p staging/cc/tensorflow 9 | cp ../../WORKSPACE staging/ 10 | cp ../../.bazelrc staging/ 11 | cp ../../cc/configure_tensorflow.sh staging/cc/ 12 | cp ../../cc/tensorflow/BUILD staging/cc/tensorflow/ 13 | cp ../../cc/tensorflow/copy_outputs.sh staging/cc/tensorflow/ 14 | cp ../../requirements.txt staging/ 15 | docker build --quiet -t "gcr.io/${PROJECT}/cc-base:${VERSION_TAG}" . 16 | rm -rfd staging/ 17 | 18 | base-push: base-image 19 | gcloud docker --verbosity=error -- push "gcr.io/${PROJECT}/cc-base:${VERSION_TAG}" 20 | 21 | .PHONY: base-image base-push 22 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/calibrator/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PROJECT 2 | FROM gcr.io/$PROJECT/cc-base:latest 3 | 4 | RUN pip3 install tensorflow==1.15.0 5 | WORKDIR /app 6 | 7 | ENV BOARD_SIZE="19" 8 | 9 | COPY staging /app 10 | COPY staging/rl_loop/ /app 11 | 12 | CMD ["sh", "-c", "python rl_loop/update_resign_threshold.py --bucket_name=$BUCKET_NAME --flagfile=rl_loop/distributed_flags"] 13 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/calibrator/Makefile: -------------------------------------------------------------------------------- 1 | # See existing images with 2 | # gcloud 
container images list 3 | # 4 | # Usage: 5 | # PROJECT=(gcp project id) VERSION_TAG=0.XY make target 6 | 7 | define staging = 8 | mkdir -p staging 9 | cp -r -p ../../rl_loop/ staging/ 10 | cp ../../*.py staging/ 11 | cp ../../requirements.txt staging/ 12 | endef 13 | 14 | 15 | image: 16 | $(staging) 17 | docker build --quiet --build-arg PROJECT=$(PROJECT) -f Dockerfile -t "gcr.io/$(PROJECT)/minigo-calibrator:$(VERSION_TAG)" . 18 | 19 | push: image 20 | gcloud docker -- push "gcr.io/$(PROJECT)/minigo-calibrator:$(VERSION_TAG)" 21 | 22 | clean: 23 | rm -rfd staging 24 | 25 | .PHONY: image push clean 26 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/calibrator/calibrator-deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: minigo-calibrator 5 | spec: 6 | replicas: 1 7 | selector: 8 | matchLabels: 9 | app: minigo-calibrator 10 | template: 11 | metadata: 12 | labels: 13 | app: minigo-calibrator 14 | spec: 15 | containers: 16 | - name: minigo-calibrator-container 17 | image: gcr.io/tensor-go/minigo-calibrator:v17 18 | imagePullPolicy: Always 19 | volumeMounts: 20 | - name: service-credentials 21 | mountPath: /etc/credentials 22 | env: 23 | - name: GCS_READ_CACHE_MAX_SIZE_MB 24 | value: "0" 25 | - name: GOOGLE_APPLICATION_CREDENTIALS 26 | value: /etc/credentials/service-account.json 27 | - name: BUCKET_NAME 28 | value: $BUCKET_NAME 29 | volumes: 30 | - name: service-credentials 31 | secret: 32 | secretName: $SERVICE_ACCOUNT-creds 33 | restartPolicy: Always 34 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/cgos/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM gcr.io/${PROJECT}/cc-base:v14 2 | 3 | RUN apt-get update && apt-get install 
gettext -y 4 | 5 | COPY cgosGtp-linux-x86_64 /app 6 | 7 | ARG MODEL 8 | ARG MODEL_NUM 9 | 10 | ENV MODEL=$MODEL 11 | ENV MODEL_NUM=$MODEL_NUM 12 | 13 | WORKDIR /app 14 | 15 | RUN gsutil cp gs://minigo-pub/v7-19x19/models/$MODEL /app 16 | COPY config.txt /app 17 | 18 | # config.txt should setup the username and password for cgos 19 | # it's not checked in for obvious reasons. 20 | RUN envsubst < config.txt > config.txt 21 | 22 | RUN cat config.txt 23 | CMD ["./cgosGtp-linux-x86_64", "-c", "config.txt"] 24 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/cgos/Makefile: -------------------------------------------------------------------------------- 1 | # See existing images with 2 | # gcloud container images list-tags gcr.io/$PROJECT/cgos-player 3 | # 4 | # Usage: 5 | # Set environment variables as follows: 6 | # - MODEL: the basename of the .pb of the model to play (e.g. 000303-olympus.pb) 7 | # - MODEL_NUM: the number of the model, e.g. '303'. This is used in the image name, CGOS username, and pod name. 8 | # - VERSION_TAG: as in the other cluster/ dockerfiles. 9 | # 10 | # VERSION_TAG=0.XY MODEL=000123-foo.pb MODEL_NUM=123 make target 11 | 12 | cgos-image: 13 | docker build --build-arg MODEL=${MODEL} --build-arg MODEL_NUM=${MODEL_NUM} -t "gcr.io/${PROJECT}/cgos-player-${MODEL_NUM}:${VERSION_TAG}" . 
14 | 15 | cgos-push: cgos-image 16 | gcloud docker -- push "gcr.io/${PROJECT}/cgos-player-${MODEL_NUM}:${VERSION_TAG}" 17 | 18 | .PHONY: cgos-image cgos-push 19 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/cgos/cgos-player.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: minigo-cgos-player-$MODEL_NUM 5 | spec: 6 | containers: 7 | - name: cgos-player-$MODEL_NUM 8 | image: gcr.io/$PROJECT/cgos-player:$VERSION_TAG 9 | imagePullPolicy: Always 10 | resources: 11 | limits: 12 | nvidia.com/gpu: 1 13 | requests: 14 | nvidia.com/gpu: 1 15 | volumeMounts: 16 | - name: service-credentials 17 | mountPath: /etc/credentials 18 | volumes: 19 | - name: service-credentials 20 | secret: 21 | secretName: $SERVICE_ACCOUNT-creds 22 | restartPolicy: OnFailure 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/cluster-down.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | 18 | source ${SCRIPT_DIR}/common.sh 19 | source ${SCRIPT_DIR}/utils.sh 20 | 21 | check_gcloud_exists 22 | 23 | gcloud container clusters delete $CLUSTER_NAME --project=$PROJECT --zone=$ZONE --async 24 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/create_table.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | set -e 17 | 18 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 19 | 20 | source ${SCRIPT_DIR}/common.sh 21 | source ${SCRIPT_DIR}/utils.sh 22 | 23 | create_cbt_game_table 24 | create_cbt_eval_game_table 25 | create_cbt_model_table 26 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/destroy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | source ./common.sh 17 | 18 | ./cluster-down.sh 19 | 20 | # Left here for documentation, if you really wish to delete more things: 21 | # 22 | # gcloud iam service-accounts delete $SERVICE_ACCOUNT_EMAIL 23 | 24 | # gsutil -m rm -r gs://$BUCKET_NAME 25 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/evaluator/Dockerfile-cc: -------------------------------------------------------------------------------- 1 | ARG project 2 | #FROM gcr.io/$project/cc-base:v17-testing 3 | from base-build-manual2 4 | 5 | RUN export CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \ 6 | echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 7 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \ 8 | apt-get update -y && apt-get install google-cloud-sdk -y 9 | 10 | RUN apt-get install python3 python3-pip -y 11 | RUN pip3 install absl-py 12 | 13 | COPY staging/ /app 14 | WORKDIR /app 15 | RUN bazel build -c opt --define=tf=1 --define=bt=1 cc/eval 16 | 17 | COPY evaluator_cc_wrapper.sh /app 18 | 19 | CMD ["/bin/bash", "evaluator_cc_wrapper.sh"] 20 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/evaluator/Dockerfile-ringmaster: -------------------------------------------------------------------------------- 1 | ARG project 2 | from base-build-manual2 3 | 4 | RUN export 
CLOUD_SDK_REPO="cloud-sdk-$(lsb_release -c -s)" && \ 5 | echo "deb http://packages.cloud.google.com/apt $CLOUD_SDK_REPO main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ 6 | curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add - && \ 7 | apt-get update -y && apt-get install google-cloud-sdk -y 8 | 9 | RUN apt-get install python3 python3-pip -y 10 | # TODO(AMJ): Get this to compile, determine base & pip requirements 11 | RUN pip3 install absl-py 12 | 13 | COPY staging/ /app 14 | WORKDIR /app 15 | 16 | COPY evaluator_ringmaster_wrapper.py /app 17 | 18 | # long series of args here. 19 | CMD ["python3", "evaluator_ringmaster_wrapper.py"] 20 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/evaluator/deploy-cc-evaluator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | 18 | source ${SCRIPT_DIR}/../common.sh 19 | source ${SCRIPT_DIR}/../utils.sh 20 | 21 | check_envsubst 22 | 23 | echo $MODEL_BLACK 24 | echo $MODEL_WHITE 25 | cat ${SCRIPT_DIR}/cc-evaluator.yaml | envsubst | kubectl apply -f - 26 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/minigui/minigui-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: minigui 5 | spec: 6 | containers: 7 | - name: minigui 8 | image: gcr.io/$(PROJECT)/$(MINIGUI_PY_CPU_CONTAINER):$(VERSION_TAG) 9 | ports: 10 | - containerPort: 5001 11 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/minigui/simple-service.yaml: -------------------------------------------------------------------------------- 1 | kind: Service 2 | apiVersion: v1 3 | metadata: 4 | name: minigo-service 5 | spec: 6 | selector: 7 | app: minigui 8 | ports: 9 | - protocol: TCP 10 | port: 80 11 | targetPort: 5001 12 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/ringmaster/Makefile: -------------------------------------------------------------------------------- 1 | 2 | gtp: 3 | mkdir -p staging/cc 4 | cp ../../WORKSPACE staging/ 5 | cp ../../.bazelrc staging/ 6 | cp -r -p ../../cc/ staging/ 7 | cp p100-lz-tuning staging/ 8 | rm -rfd staging/cc/tensorflow 9 | docker build --build-arg PROJECT=${PROJECT} -f mggtp-Dockerfile -t "gcr.io/$(PROJECT)/mg-gtp:$(VERSION_TAG)" . 
10 | 11 | ring: 12 | mkdir -p staging/cc 13 | cp ../../WORKSPACE staging/ 14 | cp ../../.bazelrc staging/ 15 | cp -r -p ../../cc/ staging/ 16 | cp p100-lz-tuning staging/ 17 | cp ringmaster_wrapper.sh staging/ 18 | rm -rfd staging/cc/tensorflow 19 | docker build --build-arg PROJECT=${PROJECT} -f lz-Dockerfile -t "gcr.io/$(PROJECT)/mg-ring:$(VERSION_TAG)" . 20 | 21 | 22 | gtp-push: gtp 23 | gcloud docker --verbosity=error -- push "gcr.io/$(PROJECT)/mg-gtp:$(VERSION_TAG)" 24 | 25 | ring-push: ring 26 | gcloud docker --verbosity=error -- push "gcr.io/$(PROJECT)/mg-ring:$(VERSION_TAG)" 27 | 28 | clean: 29 | rm -rfd staging/ 30 | .PHONY: gtp ring gtp-push ring-push clean 31 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/ringmaster/example.ctl: -------------------------------------------------------------------------------- 1 | competition_type = 'playoff' 2 | description = """Example control file""" 3 | 4 | board_size = 19 5 | komi = 7.5 6 | 7 | record_games = True 8 | stderr_to_log = True 9 | 10 | def LeelaPlayer(model, playouts): 11 | return Player( 12 | "./leelaz -g --noponder -w {} -t 1 -p {} --timemanage off ".format( 13 | model, playouts), 14 | startup_gtp_commands=["time_settings 0 1 0"]) 15 | 16 | matchups = [] 17 | players = {} 18 | for playouts in ["10000"]: 19 | p1 = LeelaPlayer('lz202.gz', playouts) 20 | p2 = LeelaPlayer('mg-16-833.gz', playouts) 21 | p1_name = "lz202_p{}".format(playouts) 22 | p2_name = "mg-16-833_p{}".format(playouts) 23 | players[p1_name] = p1 24 | players[p2_name] = p2 25 | matchup_name = "lz202_vs_mg-16-833_p{}".format(playouts) 26 | matchups.append(Matchup( 27 | p1_name, p2_name, id=matchup_name, number_of_games=2, 28 | alternating=True, scorer='players')) 29 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/ringmaster/mggtp-Dockerfile: 
-------------------------------------------------------------------------------- 1 | ARG PROJECT 2 | FROM gcr.io/$PROJECT/cc-base:latest 3 | 4 | COPY staging/ /app 5 | RUN bazel build -c opt --define=tf=1 --define=gpu=1 --define=bt=1 cc/gtp 6 | 7 | ENTRYPOINT ["bazel-bin/cc/gtp"] 8 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/ringmaster/p100-lz-tuning: -------------------------------------------------------------------------------- 1 | 0;XgemmBatched;256;25;256;36; -DKWG=32 -DKWI=8 -DMDIMA=8 -DMDIMC=8 -DMWG=32 -DNDIMB=8 -DNDIMC=8 -DNWG=32 -DSA=1 -DSB=1 -DSTRM=0 -DSTRN=0 -DVWM=4 -DVWN=4;OpenCL: NVIDIA Corporation Tesla P100-PCIE-16GB @ 1328MHz 2 | 0;XgemmBatchedHalf;256;25;256;36; -DKWG=16 -DKWI=8 -DMDIMA=16 -DMDIMC=16 -DMWG=64 -DNDIMB=8 -DNDIMC=8 -DNWG=32 -DSA=1 -DSB=1 -DSTRM=0 -DSTRN=0 -DVWM=4 -DVWN=2;OpenCL: NVIDIA Corporation Tesla P100-PCIE-16GB @ 1328MHz 3 | 1;XgemmBatched;256;25;256;36; -DKWG=32 -DKWI=8 -DMDIMA=8 -DMDIMC=8 -DMWG=32 -DNDIMB=8 -DNDIMC=8 -DNWG=32 -DSA=1 -DSB=1 -DSTRM=0 -DSTRN=0 -DTCE=0 -DVWM=4 -DVWN=4;OpenCL: NVIDIA Corporation Tesla P100-PCIE-16GB @ 1328MHz 4 | 1;XgemmBatchedHalf;256;25;256;36; -DKWG=16 -DKWI=8 -DMDIMA=16 -DMDIMC=16 -DMWG=64 -DNDIMB=8 -DNDIMC=8 -DNWG=32 -DSA=1 -DSB=1 -DSTRM=0 -DSTRN=0 -DTCE=0 -DVWM=4 -DVWN=2;OpenCL: NVIDIA Corporation Tesla P100-PCIE-16GB @ 1328MHz 5 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/ringmaster/ringmaster_wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | : ${RINGMASTER_CONTROL_PATH?"Need to set RINGMASTER_CONTROL_PATH"} 6 | : ${OUT_PATH?"Need to set OUT_PATH"} 7 | : ${MODEL_ONE?"Need to set MODEL_ONE"} 8 | : ${MODEL_TWO?"Need to set MODEL_TWO"} 9 | 10 | gcloud auth activate-service-account --key-file=$GOOGLE_APPLICATION_CREDENTIALS 11 | 
gsutil cp $RINGMASTER_CONTROL_PATH . 12 | gsutil cp $MODEL_ONE . 13 | gsutil cp $MODEL_TWO . 14 | 15 | 16 | RING_BASENAME=`basename $RINGMASTER_CONTROL_PATH` 17 | 18 | # if your control file doesn't end in .ctl, things are bad. 19 | RING_FILES=`basename $RINGMASTER_CONTROL_PATH .ctl` 20 | 21 | date 22 | echo "Running Ringmaster: $RING_BASENAME" 23 | 24 | #/mg_venv/bin/ringmaster $RING_BASENAME check 25 | /mg_venv/bin/ringmaster $RING_BASENAME run 26 | 27 | echo "Ringmaster all done" 28 | POD_NAME=`hostname | rev | cut -d'-' -f 1 | rev` 29 | 30 | gsutil -m cp -r $RING_FILES.* $OUT_PATH/$POD_NAME/ 31 | 32 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/selfplay/Dockerfile-cc: -------------------------------------------------------------------------------- 1 | ARG PROJECT 2 | FROM gcr.io/$PROJECT/cc-base:latest 3 | 4 | WORKDIR /app 5 | # Now bring in the rest of our code; changing our code will only trigger rebuilds below here 6 | COPY staging /app 7 | COPY staging/rl_loop/ /app 8 | COPY staging/mask_flags.py /app 9 | 10 | RUN bazel build -c opt cc/selfplay --define=tf=1 --define=tpu=0 --define=bt=1 11 | CMD ["sh", "-c", "python rl_loop/selfplay.py --bucket_name=$BUCKET_NAME --mode=cc"] 12 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/selfplay/Dockerfile-tpu: -------------------------------------------------------------------------------- 1 | ARG PROJECT 2 | FROM gcr.io/$PROJECT/cc-base:latest 3 | 4 | # Tensorflow is needed for gfile 5 | RUN pip3 install tensorflow==1.15.0 6 | WORKDIR /app 7 | 8 | ARG RUNMODE 9 | 10 | ENV RUNMODE=$RUNMODE 11 | ENV BOARD_SIZE="19" 12 | 13 | COPY staging /app 14 | 15 | COPY staging/rl_loop/ /app 16 | COPY staging/mask_flags.py /app 17 | 18 | RUN bazel build -c opt cc/selfplay --define=tf=1 --define=tpu=1 --define=bt=1 19 | CMD ["sh", "-c", "python 
rl_loop/selfplay.py --bucket_name=$BUCKET_NAME --mode=$RUNMODE"] 20 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/selfplay/deploy-cc-player.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | 18 | source ${SCRIPT_DIR}/../common.sh 19 | source ${SCRIPT_DIR}/../utils.sh 20 | 21 | check_envsubst 22 | 23 | cat ${SCRIPT_DIR}/cc-player.yaml | envsubst | kubectl apply -f - 24 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/selfplay/deploy-cpu-player.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | source ${SCRIPT_DIR}/../common.sh 18 | source ${SCRIPT_DIR}/../utils.sh 19 | 20 | check_envsubst 21 | 22 | cat ${SCRIPT_DIR}/cpu-player.yaml | envsubst | kubectl apply -f - 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/trainer/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG PROJECT 2 | FROM gcr.io/$PROJECT/cc-base:latest 3 | 4 | RUN pip3 install tensorflow==1.15.0 5 | WORKDIR /app 6 | 7 | ENV BOARD_SIZE="19" 8 | 9 | COPY staging /app 10 | COPY staging/rl_loop/ /app 11 | COPY staging/mask_flags.py /app 12 | 13 | CMD ["sh", "-c", "python rl_loop/train_and_validate.py --bucket_name=$BUCKET_NAME --pro_dataset=$PRO_DATASET"] 14 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/trainer/Makefile: -------------------------------------------------------------------------------- 1 | # See existing images with 2 | # gcloud container images list 3 | # 4 | # Usage: 5 | # PROJECT=(gcp project id) VERSION_TAG=0.XY make target 6 | 7 | define staging = 8 | mkdir -p staging 9 | cp ../../WORKSPACE staging/ 10 | cp ../../.bazelrc staging/ 11 | cp -r -p ../../rl_loop/ staging/ 12 | cp -r -p ../../cc/ staging/ 13 | rm -rfd staging/cc/tensorflow 14 | cp ../../*.py staging/ 15 | cp ../../requirements.txt staging/ 16 | endef 17 | 18 | 19 | image: 20 | $(staging) 21 | docker build --quiet --build-arg PROJECT=$(PROJECT) -f Dockerfile -t "gcr.io/$(PROJECT)/minigo-tpu-trainer:$(VERSION_TAG)" . 
22 | 23 | push: image 24 | gcloud docker --verbosity=error -- push "gcr.io/$(PROJECT)/minigo-tpu-trainer:$(VERSION_TAG)" 25 | 26 | clean: 27 | rm -rfd staging 28 | 29 | .PHONY: image push clean 30 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/cluster/trainer/deploy-trainer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2018 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 17 | 18 | source ${SCRIPT_DIR}/../common.sh 19 | source ${SCRIPT_DIR}/../utils.sh 20 | 21 | check_envsubst 22 | 23 | if [[ -z "${BUCKET_NAME}" ]]; then 24 | echo >&2 "BUCKET_NAME is not defined" 25 | return 1 26 | fi 27 | 28 | cat ${SCRIPT_DIR}/tpu-trainer-deployment.yaml | envsubst | kubectl apply -f - 29 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/control/leelaz.ctl: -------------------------------------------------------------------------------- 1 | board_size = 19 2 | 3 | players = { 4 | "leelaz" : Player("../leela-zero/build/leelaz" 5 | " --weights best-network" 6 | " --timemanage fast" 7 | " -g", 8 | startup_gtp_commands=[], 9 | cwd="../leela-zero/build"), 10 | } 11 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/control/minigo_edgetpu.ctl: -------------------------------------------------------------------------------- 1 | board_size = 19 2 | 3 | players = { 4 | "minigo_edgetpu" : Player("python" 5 | " -u" 6 | " gtp.py" 7 | " --load_file=saved_models/v17-2019-04-29-edgetpu.tflite" 8 | " --minigui_mode=true" 9 | " --num_readouts=800" 10 | " --resign_threshold=-0.8" 11 | " --parallel_readouts=1" 12 | " --verbose=2", 13 | startup_gtp_commands=[], 14 | environ={"BOARD_SIZE": str(board_size)}), 15 | } 16 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/control/minigo_py.ctl: -------------------------------------------------------------------------------- 1 | board_size = 19 2 | 3 | players = { 4 | "minigo_py" : Player("python" 5 | " -u" 6 | " gtp.py" 7 | " --load_file=saved_models/000990-cormorant" 8 | " --minigui_mode=true" 9 | " --num_readouts=64" 10 | " --conv_width=256" 11 | " --resign_threshold=-0.8" 12 | " --verbose=2", 13 
| startup_gtp_commands=[], 14 | environ={"BOARD_SIZE": str(board_size)}), 15 | } 16 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/control/minigo_tf.ctl: -------------------------------------------------------------------------------- 1 | board_size = 19 2 | 3 | players = { 4 | "minigo_tf" : Player("bazel-bin/cc/gtp" 5 | " --minigui=true" 6 | " --engine=tf" 7 | " --model=saved_models/000990-cormorant.pb" 8 | " --num_readouts=64" 9 | " --value_init_penalty=0" 10 | " --courtesy_pass=true" 11 | " --virtual_losses=8" 12 | " --resign_threshold=-0.8", 13 | startup_gtp_commands=[ 14 | "report_search_interval 100", 15 | ]), 16 | } 17 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/control/vs.ctl: -------------------------------------------------------------------------------- 1 | players = { 2 | "leelaz" : Player("../leela-zero/build/leelaz" 3 | " --weights best-network" 4 | " --timemanage off -r 3" 5 | " --noponder" 6 | " -g", 7 | startup_gtp_commands=[ 8 | "time_settings 0 5 1", 9 | ], 10 | cwd="../leela-zero/build"), 11 | 12 | "minigo" : Player("bazel-bin/cc/gtp" 13 | " --minigui=true" 14 | " --model=tf,saved_models/000990-cormorant.pb" 15 | " --num_readouts=200" 16 | " --value_init_penalty=0" 17 | " --courtesy_pass=true" 18 | " --virtual_losses=8" 19 | " --resign_threshold=-0.8", 20 | startup_gtp_commands=[ 21 | "report_search_interval 100", 22 | ]), 23 | } 24 | 25 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/edgetpu/start_chromium.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2019 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # You can pass a parameter as the start URL 16 | killall chromium 17 | export DISPLAY=:0 18 | export GDK_BACKEND=x11 19 | chromium --incognito $1 & 20 | CHROMIUM_PID=$! 21 | sleep 5 22 | xte -x :0 "key F11" 23 | xte -x :0 "keydown Control_L" "key 0" "keyup Control_L" 24 | wait ${CHROMIUM_PID} 25 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | numpy 3 | flask 4 | flask-socketio 5 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/static/view.js: -------------------------------------------------------------------------------- 1 | define(["require", "exports"], function (require, exports) { 2 | "use strict"; 3 | Object.defineProperty(exports, "__esModule", { value: true }); 4 | class ViewPainter { 5 | constructor() { 6 | this.pendingViews = []; 7 | } 8 | draw(view) { 9 | if (this.pendingViews.length == 0) { 10 | window.requestAnimationFrame(() => { 11 | for (let view of this.pendingViews) { 12 | view.drawImpl(); 13 | } 14 | this.pendingViews = []; 15 | }); 16 | } 17 | if (this.pendingViews.indexOf(view) == -1) { 18 | this.pendingViews.push(view); 19 | } 20 | } 21 | } 22 | let painter = new ViewPainter(); 23 | class View { 24 | draw() { painter.draw(this); } 25 | } 26 | exports.View = View; 27 | }); 28 | //# sourceMappingURL=view.js.map 
-------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/minigui/unset-minigui-common.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | unset MINIGUI_PYTHON 16 | unset MINIGUI_BUCKET_NAME 17 | unset MINIGUI_GCS_DIR 18 | unset MINIGUI_MODEL 19 | unset MINIGUI_MODEL_TMPDIR 20 | unset MINIGUI_BOARD_SIZE 21 | unset MINIGUI_PORT 22 | unset MINIGUI_HOST 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/.gitignore: -------------------------------------------------------------------------------- 1 | checkpoint/ 2 | results/ 3 | target/ 4 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/architecture.flags: -------------------------------------------------------------------------------- 1 | # architecture.flags: Flags that control the model architecture. 2 | 3 | --conv_width=64 4 | --fc_width=64 5 | --trunk_layers=6 6 | --value_cost_weight=0.5 7 | --summary_steps=128 8 | 9 | --bool_features=1 10 | 11 | # --input_features=$FEATURES and --input_layout=$LAYOUT must match 12 | # --model=random:$FEATURES:$LAYOUT:0 in bootstrap.flags. 
13 | --input_features=mlperf07 14 | --input_layout=nchw 15 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/bootstrap.flags: -------------------------------------------------------------------------------- 1 | # bootstrap.flags 2 | # Flags for the first bootstrap round of selfplay. 3 | 4 | --flagfile=ml_perf/flags/19/selfplay.flags 5 | 6 | --num_readouts=80 7 | --fastplay_frequency=0 8 | 9 | --holdout_pct=0 10 | --device=0 11 | --cache_size_mb=0 12 | 13 | --output_threads=4 14 | --selfplay_threads=48 15 | --parallel_search=1 16 | --parallel_inference=48 17 | --concurrent_games_per_thread=64 18 | 19 | --min_resign_threshold=-1.00 20 | --max_resign_threshold=-0.99 21 | 22 | --allow_pass=0 23 | --target_pruning=0 24 | --restrict_pass_alive_play_threshold=4 25 | 26 | # --model=random:$FEATURES:$LAYOUT:0 must match --input_features=$FEATURES and 27 | # --input_layout=$LAYOUT in architecture.flags. 28 | --model=random:mlperf07:nchw:0 29 | 30 | --num_games=8192 31 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/eval.flags: -------------------------------------------------------------------------------- 1 | # eval.flags: Flags for playing eval games. 
2 | 3 | --flagfile=ml_perf/flags/19/selfplay.flags 4 | 5 | --value_init_penalty=0.2 6 | --num_readouts=100 7 | --fastplay_frequency=0 8 | --resign_enabled=false 9 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/rl_loop.flags: -------------------------------------------------------------------------------- 1 | --flags_dir=ml_perf/flags/19/ 2 | --checkpoint_dir=ml_perf/checkpoint/19/ 3 | 4 | --iterations=200 5 | --gating_win_rate=0.49 6 | --window_size=5 7 | 8 | --train_devices=0 9 | --eval_device=1 10 | --selfplay_devices=0,1,2,3,4,5,6,7 11 | 12 | --train_filter=0.3 13 | 14 | --bootstrap_target_win_rate=0.05 15 | 16 | --eval_num_games=100 17 | 18 | --validate=1 19 | 20 | # Consider also updating num_games in bootstrap.flags if updating 21 | # min_games_per_iteration. 22 | --min_games_per_iteration=8192 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/selfplay.flags: -------------------------------------------------------------------------------- 1 | # selfplay.flags: Flags for selfplay. 2 | 3 | # This flagfile also serves as the base for the bootstrap & eval stages of 4 | # the RL loop. 
5 | 6 | --num_readouts=800 7 | --fastplay_frequency=0.75 8 | --fastplay_readouts=80 9 | --value_init_penalty=0.2 10 | --holdout_pct=0.0 11 | --disable_resign_pct=0.0 12 | --min_resign_threshold=-1.0 13 | --max_resign_threshold=-0.9 14 | --virtual_losses=4 15 | 16 | --dirichlet_alpha=0.03 17 | --noise_mix=0.3 18 | 19 | --cache_size_mb=8192 20 | --verbose=false 21 | 22 | --selfplay_threads=3 23 | --parallel_search=4 24 | --parallel_inference=2 25 | --concurrent_games_per_thread=32 26 | 27 | --target_pruning=1 28 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/train.flags: -------------------------------------------------------------------------------- 1 | # train.flags: Flags for training. 2 | 3 | --flagfile=ml_perf/flags/19/architecture.flags 4 | 5 | --shuffle_buffer_size=0 6 | --shuffle_examples=false 7 | 8 | --keep_checkpoint_max=100 9 | 10 | # Device specific hyperparameters re: batch size and LR schedules. 11 | --train_batch_size=4096 12 | 13 | --lr_rates=0.016 14 | --lr_rates=0.16 15 | --lr_rates=0.016 16 | --lr_rates=0.0016 17 | 18 | --lr_boundaries=128 19 | --lr_boundaries=10000 20 | --lr_boundaries=20000 21 | --l2_strength=0.0001 22 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/train_loop.flags: -------------------------------------------------------------------------------- 1 | --iterations=100 2 | 3 | --window_size=5 4 | --train_filter=0.3 5 | 6 | --validate=0 7 | 8 | # --num_games in bootstrap.flags must be >= --min_games_per_iteration. 9 | --min_games_per_iteration=8192 10 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/19/validate.flags: -------------------------------------------------------------------------------- 1 | # validate.flags Flags for validation. 
2 | 3 | --flagfile=ml_perf/flags/19/architecture.flags 4 | 5 | --examples_to_validate=512 6 | --train_batch_size=64 7 | --summary_steps=8 8 | --l2_strength=0.0001 9 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/architecture.flags: -------------------------------------------------------------------------------- 1 | # architecture.flags: Flags that control the model architecture. 2 | 3 | --conv_width=64 4 | --fc_width=64 5 | --trunk_layers=9 6 | --value_cost_weight=0.5 7 | --summary_steps=64 8 | 9 | --bool_features=1 10 | 11 | # --input_features=$FEATURES and --input_layout=$LAYOUT must match 12 | # --model=random:$FEATURES:$LAYOUT:0 in bootstrap.flags. 13 | --input_features=mlperf07 14 | --input_layout=nhwc 15 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/bootstrap.flags: -------------------------------------------------------------------------------- 1 | # bootstrap.flags 2 | # Flags for the first bootstrap round of selfplay. 3 | 4 | --flagfile=ml_perf/flags/9/selfplay.flags 5 | 6 | --num_readouts=80 7 | --fastplay_frequency=0 8 | 9 | --holdout_pct=0 10 | --device=0 11 | --cache_size_mb=0 12 | 13 | --output_threads=4 14 | --selfplay_threads=48 15 | --parallel_search=1 16 | --parallel_inference=48 17 | --concurrent_games_per_thread=64 18 | 19 | --min_resign_threshold=-1.00 20 | --max_resign_threshold=-0.99 21 | 22 | --allow_pass=0 23 | --target_pruning=0 24 | --restrict_pass_alive_play_threshold=4 25 | 26 | # --model=random:$FEATURES:$LAYOUT:0 must match --input_features=$FEATURES and 27 | # --input_layout=$LAYOUT in architecture.flags. 
28 | --model=random:mlperf07:nhwc:0 29 | 30 | --num_games=4096 31 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/eval.flags: -------------------------------------------------------------------------------- 1 | # eval.flags: Flags for playing eval games. 2 | 3 | --flagfile=ml_perf/flags/9/selfplay.flags 4 | 5 | # --num_readouts=240 6 | --fastplay_frequency=0 7 | --resign_enabled=false 8 | 9 | --parallel_games=100 10 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/rl_loop.flags: -------------------------------------------------------------------------------- 1 | --flags_dir=ml_perf/flags/9/ 2 | --checkpoint_dir=ml_perf/checkpoint/9/ 3 | 4 | --iterations=50 5 | --gating_win_rate=0.49 6 | --window_size=5 7 | 8 | --train_devices=0 9 | --eval_device=1 10 | --selfplay_devices=0,1,2,3,4,5,6,7 11 | 12 | --train_filter=0.3 13 | 14 | --bootstrap_target_win_rate=0.05 15 | 16 | --eval_num_games=100 17 | 18 | --validate=1 19 | 20 | # Consider also updating num_games in bootstrap.flags if updating 21 | # min_games_per_iteration. 22 | --min_games_per_iteration=4096 23 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/selfplay.flags: -------------------------------------------------------------------------------- 1 | # selfplay.flags: Flags for selfplay. 2 | 3 | # This flagfile also serves as the base for the bootstrap & eval stages of 4 | # the RL loop. 
5 | 6 | --num_readouts=240 7 | --fastplay_frequency=0 8 | --value_init_penalty=0.2 9 | --holdout_pct=0.025 10 | --disable_resign_pct=1.0 11 | --min_resign_threshold=-1.0 12 | --max_resign_threshold=-0.8 13 | --virtual_losses=2 14 | 15 | --dirichlet_alpha=0.135 16 | --noise_mix=0.3 17 | 18 | --cache_size_mb=2048 19 | --verbose=false 20 | 21 | --selfplay_threads=3 22 | --parallel_search=2 23 | --parallel_inference=3 24 | --concurrent_games_per_thread=128 25 | 26 | --target_pruning=1 27 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/train.flags: -------------------------------------------------------------------------------- 1 | # train.flags: Flags for training. 2 | 3 | --flagfile=ml_perf/flags/9/architecture.flags 4 | 5 | --shuffle_buffer_size=0 6 | --shuffle_examples=false 7 | 8 | --keep_checkpoint_max=100 9 | 10 | # see --train_filter in rl_loop.flags 11 | # --filter_amount=1 12 | 13 | # Device specific hyperparameters re: batch size and LR schedules. 14 | --train_batch_size=4096 15 | --lr_rates=0.16 16 | --lr_rates=0.016 17 | --lr_rates=0.0016 18 | --lr_boundaries=25000 19 | --lr_boundaries=37500 20 | --l2_strength=0.0001 21 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/train_loop.flags: -------------------------------------------------------------------------------- 1 | --iterations=50 2 | 3 | --window_size=5 4 | --train_filter=0.3 5 | 6 | --validate=1 7 | 8 | # Consider also updating num_games in bootstrap.flags if updating 9 | # min_games_per_iteration. 10 | --min_games_per_iteration=4096 11 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/flags/9/validate.flags: -------------------------------------------------------------------------------- 1 | # validate.flags: Flags for validation. 
2 | 3 | --flagfile=ml_perf/flags/9/architecture.flags 4 | 5 | --examples_to_validate=512 6 | --train_batch_size=64 7 | --summary_steps=8 8 | --l2_strength=0.0001 9 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/ml_perf/scripts/stop_selfplay.sh: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Stops selfplay processes by creating an "abort file" at a location monitored 16 | # by the selfplay processes. 17 | # This script is called automatically by train.sh on exit. 
18 | 19 | 20 | source ml_perf/scripts/common.sh 21 | 22 | 23 | touch "${abort_file}" 24 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/oneoffs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/oneoffs/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/requirements-analysis.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | pandas 3 | choix 4 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/requirements-colab.txt: -------------------------------------------------------------------------------- 1 | # Copy of requirements.txt with packages in colab commented. 
2 | 3 | #absl-py 4 | autopep8>=1.3 5 | fire 6 | google.cloud.logging 7 | google.cloud.bigtable 8 | #grpcio-tools 9 | #keras 10 | #numpy>=1.14.0 11 | #protobuf 12 | pylint 13 | sgf==0.5 14 | #six 15 | #tqdm>=4.17 16 | 17 | #oauth2client==4.1 18 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py 2 | autopep8>=1.3 3 | choix>=0.3.3 4 | fire 5 | google.cloud.logging 6 | google.cloud.bigtable 7 | grpcio-tools 8 | keras 9 | numpy>=1.14.0 10 | protobuf 11 | pylint 12 | sgf==0.5 13 | six 14 | tqdm>=4.17 15 | pyasn1>=0.4.1 16 | setuptools>=34.0.0 17 | 18 | oauth2client==4.1 19 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/rl_loop/distributed_flags_nr: -------------------------------------------------------------------------------- 1 | # this file is the subset of flags used by the set of clusters that will do 2 | # selfplay with resignation disabled (AKA 'calibration' games). 3 | # 4 | # Network architecture flags 5 | --conv_width=256 6 | --fc_width=256 7 | --trunk_layers=19 8 | --use_SE 9 | --use_SE_bias 10 | 11 | # Selfplay related flags. 
12 | # These flags can be overwritten by --flags_path (see cc/main.cc for details) 13 | --run_forever=true 14 | --inject_noise=true 15 | --soft_pick=true 16 | --random_symmetry=true 17 | --virtual_losses=8 18 | --parallel_games=8 19 | --num_readouts=800 20 | --disable_resign_pct=1.00 21 | --resign_threshold=-1.0 22 | --value_init_penalty=2.0 23 | --output_bigtable=tensor-go,minigo-instance,v17-games-nr 24 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/rl_loop/local_flags: -------------------------------------------------------------------------------- 1 | # This file is an enumeration of all hyperparameters required by the 2 | # AGZ pipeline. The flags in this file get passed into all of the scripts 3 | # in the rl_loop/ directory via the mask_flags.py helper library. 4 | 5 | --conv_width=8 6 | --fc_width=16 7 | --trunk_layers=1 8 | --train_batch_size=16 9 | --shuffle_buffer_size=1000 10 | --num_evaluation_games=1 11 | --verbose=0 12 | --num_readouts=10 13 | --value_init_penalty=2.0 14 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/minigo/tensorflow/minigo/tests/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/minigo/tests/test_flags: -------------------------------------------------------------------------------- 1 | # One line per flag 2 | # For example --someflag=value 3 | --conv_width=8 4 | --fc_width=2 5 | --trunk_layers=1 6 | --cbt_project=foo 7 | --cbt_instance=bar 8 | --cbt_table=baz 9 | -------------------------------------------------------------------------------- 
/retired_benchmarks/minigo/tensorflow/minigo/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ESNEXT", 4 | "module": "system", 5 | "noImplicitAny": true, 6 | "noImplicitThis": true, 7 | "strictNullChecks": true, 8 | "removeComments": true, 9 | "preserveConstEnums": true, 10 | "outDir": "minigui/static/", 11 | "sourceMap": true, 12 | "module": "amd" 13 | }, 14 | "include": [ 15 | "minigui/*.ts" 16 | ], 17 | "exclude": [ 18 | ] 19 | } 20 | 21 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script should be only executed in docker. 3 | # Run minigo... stop when it converges. 4 | set -e 5 | 6 | SEED=$1 7 | mkdir -p /research/results/minigo/final/ 8 | cd /research/reinforcement/minigo 9 | bash loop_main.sh params/final.json $SEED 10 | -------------------------------------------------------------------------------- /retired_benchmarks/minigo/tensorflow/run_and_time.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | # runs benchmark and reports time to convergence 3 | # to use the script: 4 | # run_and_time.sh 5 | 6 | 7 | set -e 8 | 9 | # start timing 10 | start=$(date +%s) 11 | start_fmt=$(date +%Y-%m-%d\ %r) 12 | echo "STARTING TIMING RUN AT $start_fmt" 13 | 14 | 15 | # run benchmark 16 | 17 | seed=${1:-1} 18 | 19 | echo "running benchmark with seed $seed" 20 | # The termination quality is set in params/final.json. See README.md.
21 | ./run.sh $seed 22 | sleep 3 23 | ret_code=$?; if [[ $ret_code != 0 ]]; then exit $ret_code; fi 24 | 25 | 26 | # end timing 27 | end=$(date +%s) 28 | end_fmt=$(date +%Y-%m-%d\ %r) 29 | echo "ENDING TIMING RUN AT $end_fmt" 30 | 31 | 32 | # report result 33 | result=$(( $end - $start )) 34 | result_name="reinforcement" 35 | 36 | 37 | echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt" 38 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/config/dataset/c4_mlperf.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: c4_mlperf 2 | train_dataset_path: gs://mlperf-llm-public2/c4/en_json/3.0.1 3 | eval_dataset_path: gs://mlperf-llm-public2/c4/en_val_subset_json 4 | streaming: True 5 | 6 | # num of process in data processing 7 | num_proc: 1 8 | 9 | # whether to load dataset from cache 10 | load_from_cache_file: True 11 | 12 | shuffle_buffer_size: 256 -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/config/dataset/wikitext.yaml: -------------------------------------------------------------------------------- 1 | dataset_name: wikitext 2 | dataset_config_name: wikitext-2-raw-v1 3 | streaming: False 4 | 5 | # num of process in data processing 6 | num_proc: 1 7 | 8 | # whether to load dataset from cache 9 | load_from_cache_file: True 10 | 11 | shuffle_buffer_size: 256 -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/config/model/blank_model.yaml: -------------------------------------------------------------------------------- 1 | config_path: null 2 | name_or_path: mistralai/Mixtral-8x7B-v0.1 3 | dtype: bfloat16 4 | flash_attention: True 5 | capacity_factor: 0 # dropped implementation with a positive number 6 | max_sequence_length: ${max_length} 7 | 8 | fsdp_config: 9 | fsdp_transformer_layer_cls_to_wrap: 
["MixtralDecoderLayer"] 10 | min_num_params: 0 11 | xla_fsdp_grad_ckpt: true 12 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/config/sched/CosineAnnealing.yaml: -------------------------------------------------------------------------------- 1 | name: CosineAnnealing 2 | warmup_ratio: 0.25 3 | # warmup_steps: 150 4 | min_lr: ${multiply:0.1,${lr}} 5 | max_steps: ${max_steps} 6 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/config/sched/WarmupHoldPolicy.yaml: -------------------------------------------------------------------------------- 1 | name: WarmupHoldPolicy 2 | warmup_ratio: 0.25 3 | # warmup_steps: 150 4 | hold_steps: 10000000000000 # Incredibly large value to hold the lr as constant 5 | max_steps: ${max_steps} 6 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/docker/gpu/build_and_push_image.sh: -------------------------------------------------------------------------------- 1 | set -euox pipefail 2 | SCRIPTS_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" && pwd )" 3 | DATE=$(date +%Y%m%d) 4 | : ${PROJECT_ID:=cloud-tpu-multipod-dev} 5 | : ${IMAGE:=gcr.io/${PROJECT_ID}/${USER}-pytorch-nemo-moe-${DATE}} 6 | : ${DOCKER_BUILD_ARGS:=""} 7 | 8 | pushd ${SCRIPTS_DIR} 9 | 10 | docker build --network host \ 11 | --file Dockerfile \ 12 | --tag ${IMAGE}-base \ 13 | ${DOCKER_BUILD_ARGS} \ 14 | . 15 | 16 | docker build --network host \ 17 | --file Dockerfile.GCP \ 18 | --tag ${IMAGE} \ 19 | --build-arg FROM_BASE_IMAGE=${IMAGE}-base \ 20 | . 
21 | 22 | popd 23 | 24 | docker push ${IMAGE} 25 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/docker/tpu/build_and_push_image.sh: -------------------------------------------------------------------------------- 1 | set -euox pipefail 2 | SCRIPTS_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" && pwd )" 3 | DATE=$(date +%Y%m%d) 4 | : ${PROJECT_ID:=cloud-tpu-multipod-dev} 5 | : ${IMAGE:=gcr.io/${PROJECT_ID}/${USER}-pytorch-xla-moe-${DATE}} 6 | : ${DOCKER_BUILD_ARGS:=""} 7 | 8 | pushd ${SCRIPTS_DIR} 9 | 10 | docker build --network host \ 11 | --file Dockerfile \ 12 | --tag ${IMAGE} \ 13 | ${DOCKER_BUILD_ARGS} \ 14 | . 15 | popd 16 | 17 | docker push ${IMAGE} 18 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/download_dataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2024 Google LLC 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | https://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import sys 18 | from huggingface_hub import snapshot_download 19 | 20 | snapshot_download(repo_id=sys.argv[1], local_dir=sys.argv[2]) 21 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/helm_context/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: megatron_moe_benchmark 3 | description: megatron_moe_benchmark 4 | type: application 5 | version: 0.1.0 6 | appVersion: "1.16.0" -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/mixtral80.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "MixtralForCausalLM" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 4096, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 14336, 12 | "max_position_embeddings": 32768, 13 | "model_type": "mixtral", 14 | "num_attention_heads": 32, 15 | "num_experts_per_tok": 2, 16 | "num_hidden_layers": 1, 17 | "num_key_value_heads": 8, 18 | "num_local_experts": 8, 19 | "output_router_logits": false, 20 | "rms_norm_eps": 1e-05, 21 | "rope_theta": 1000000.0, 22 | "router_aux_loss_coef": 0.02, 23 | "sliding_window": null, 24 | "tie_word_embeddings": false, 25 | "torch_dtype": "bfloat16", 26 | "transformers_version": "4.36.0.dev0", 27 | "use_cache": true, 28 | "vocab_size": 32000 29 | } 30 | 31 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/mixtral822-instruct.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "MixtralForCausalLM" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 6144, 10 | "initializer_range": 0.02, 11 
| "intermediate_size": 16384, 12 | "max_position_embeddings": 65536, 13 | "model_type": "mixtral", 14 | "num_attention_heads": 48, 15 | "num_experts_per_tok": 2, 16 | "num_hidden_layers": 56, 17 | "num_key_value_heads": 8, 18 | "num_local_experts": 8, 19 | "output_router_logits": false, 20 | "rms_norm_eps": 1e-05, 21 | "rope_theta": 1000000, 22 | "router_aux_loss_coef": 0.001, 23 | "sliding_window": null, 24 | "tie_word_embeddings": false, 25 | "torch_dtype": "bfloat16", 26 | "transformers_version": "4.38.0", 27 | "use_cache": true, 28 | "vocab_size": 32768 29 | } 30 | 31 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/mixtral822.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "MixtralForCausalLM" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 6144, 10 | "initializer_range": 0.02, 11 | "intermediate_size": 16384, 12 | "max_position_embeddings": 65536, 13 | "model_type": "mixtral", 14 | "num_attention_heads": 48, 15 | "num_experts_per_tok": 2, 16 | "num_hidden_layers": 56, 17 | "num_key_value_heads": 8, 18 | "num_local_experts": 8, 19 | "output_router_logits": false, 20 | "rms_norm_eps": 1e-05, 21 | "rope_theta": 1000000, 22 | "router_aux_loss_coef": 0.001, 23 | "sliding_window": null, 24 | "tie_word_embeddings": false, 25 | "torch_dtype": "bfloat16", 26 | "transformers_version": "4.38.0", 27 | "use_cache": true, 28 | "vocab_size": 32000 29 | } 30 | 31 | -------------------------------------------------------------------------------- /retired_benchmarks/mixtral8x22b/mixtral87.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "MixtralForCausalLM" 4 | ], 5 | "attention_dropout": 0.0, 6 | "bos_token_id": 1, 7 | "eos_token_id": 2, 8 | "hidden_act": "silu", 9 | "hidden_size": 4096, 10 
| "initializer_range": 0.02, 11 | "intermediate_size": 14336, 12 | "max_position_embeddings": 32768, 13 | "model_type": "mixtral", 14 | "num_attention_heads": 32, 15 | "num_experts_per_tok": 2, 16 | "num_hidden_layers": 32, 17 | "num_key_value_heads": 8, 18 | "num_local_experts": 8, 19 | "output_router_logits": false, 20 | "rms_norm_eps": 1e-05, 21 | "rope_theta": 1000000.0, 22 | "router_aux_loss_coef": 0.02, 23 | "sliding_window": null, 24 | "tie_word_embeddings": false, 25 | "torch_dtype": "bfloat16", 26 | "transformers_version": "4.36.0.dev0", 27 | "use_cache": true, 28 | "vocab_size": 32000 29 | } 30 | -------------------------------------------------------------------------------- /retired_benchmarks/ncf/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | -------------------------------------------------------------------------------- /retired_benchmarks/ncf/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=pytorch/pytorch:1.0.1-cuda10.0-cudnn7-runtime 2 | FROM ${FROM_IMAGE_NAME} 3 | 4 | # Install Python dependencies 5 | WORKDIR /workspace/recommendation 6 | 7 | COPY requirements.txt . 8 | RUN pip install -r requirements.txt 9 | 10 | COPY negative_sampling_cpp ./negative_sampling_cpp 11 | WORKDIR /workspace/recommendation/negative_sampling_cpp 12 | RUN python setup.py install 13 | 14 | # Copy NCF code and build 15 | WORKDIR /workspace/recommendation 16 | COPY . . 
17 | -------------------------------------------------------------------------------- /retired_benchmarks/ncf/negative_sampling_cpp/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from torch.utils.cpp_extension import CppExtension, BuildExtension 3 | 4 | setup(name='negative_sampling', 5 | ext_modules=[CppExtension('negative_sampling', ['negative_sampling.cpp'])], 6 | cmdclass={'build_ext': BuildExtension}) 7 | -------------------------------------------------------------------------------- /retired_benchmarks/ncf/negative_sampling_cpp/test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import negative_sampling 3 | 4 | n_positives = 1000 5 | n_users = 10 6 | n_items = 500 7 | users = torch.randint(size=[n_positives, 1], low=0, high=n_users) 8 | items = torch.randint(size=[n_positives, 1], low=0, high=n_items) 9 | 10 | positives = torch.cat([users, items], dim=1) 11 | positives, _ = torch.sort(positives, dim=1) 12 | positives, _ = torch.sort(positives, dim=0) 13 | 14 | print("positives: ", positives) 15 | 16 | 17 | sampler = negative_sampling.NegativeSampler(positives, n_users, n_items) 18 | train_negatives = sampler.generate_train(4) 19 | test_negatives = sampler.generate_test(20) 20 | 21 | print(train_negatives) 22 | print(test_negatives) 23 | 24 | -------------------------------------------------------------------------------- /retired_benchmarks/ncf/requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm==4.20.0 2 | scipy 3 | torch 4 | numpy 5 | numpy_indexed 6 | pandas 7 | mlperf_compliance==0.0.10 8 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/sentiment_analysis/download_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Use the python 
script to download the IMDB dataset 4 | python download.py 5 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/sentiment_analysis/paddle/run_and_time.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start timing 4 | start_time=$(date +%s) 5 | start_fmt=$(date +%Y-%m-%d\ %r) 6 | echo "STARTING TIMING RUN AT $start_fmt" 7 | 8 | seed=$1 9 | echo "Running sentiment benchmark with seed $seed" 10 | 11 | # Train a sentiment_analysis model (default: conv model), with a user 12 | # specified seed 13 | python train.py -s ${seed} 14 | 15 | # End timing 16 | end_time=$(date +%s) 17 | end_fmt=$(date +%Y-%m-%d\ %r) 18 | echo "ENDING TIMING RUN AT $end_fmt" 19 | 20 | # Report result 21 | result=$(( ${end_time} - ${start_time} )) 22 | result_name="sentiment" 23 | 24 | echo "RESULT,$result_name,$seed,$result,$USER,$start_fmt" 25 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/sentiment_analysis/verify.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import hashlib 4 | import os 5 | 6 | URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' 7 | MD5 = '7c2ac02c03563afcf9b574c7e56c153a' 8 | DIR = os.path.expanduser('~/.cache/paddle/dataset/imdb') 9 | PATH = os.path.join(DIR, URL.split('/')[-1]) 10 | CHUNK_SIZE = 4096 11 | 12 | def md5content(fname): 13 | hash_md5 = hashlib.md5() 14 | f = open(fname, "rb") 15 | for chunk in iter(lambda: f.read(CHUNK_SIZE), b""): 16 | hash_md5.update(chunk) 17 | f.close() 18 | return hash_md5.hexdigest() 19 | 20 | # Verify MD5 checksum 21 | def verify(): 22 | if md5content(PATH) == MD5: 23 | print("PASSED!") 24 | else: 25 | print("FAILED") 26 | 27 | if __name__ == "__main__": 28 | verify() 29 | 
-------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/sentiment_analysis/verify_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Use the python script to verify the MD5 checksum 4 | # of the downloaded dataset. 5 | python verify.py 6 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/__init__.py: -------------------------------------------------------------------------------- 1 | import data 2 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/data/.gitignore: -------------------------------------------------------------------------------- 1 | an4_dataset/ 2 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/data/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import data_loader 2 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/data/data-LibriSpeech-ref-cksum.out: -------------------------------------------------------------------------------- 1 | 2730530160 113699829760 data-LibriSpeech-ref.tar 2 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/download_dataset.sh: -------------------------------------------------------------------------------- 1 | # Script to download Librispeech Dataset 2 | #for script testing, only fetch minimal, clean dataset 3 | 4 | python data/librispeech.py #--files_to_use train-clean-100.tar.gz,dev-clean.tar.gz,test-clean.tar.gz 5 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/labels.json: -------------------------------------------------------------------------------- 1 | [ 2 | "_", 3 | "'", 4 | "A", 5 | "B", 6 | "C", 7 | "D", 8 | "E", 9 | "F", 10 | "G", 11 | "H", 12 | "I", 13 | "J", 14 | "K", 15 | "L", 16 | "M", 17 | "N", 18 | "O", 19 | "P", 20 | "Q", 21 | "R", 22 | "S", 23 | "T", 24 | "U", 25 | "V", 26 | "W", 27 | "X", 28 | "Y", 29 | "Z", 30 | " " 31 | ] -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/pytorch/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | models/ 3 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/pytorch/docker/base.gpu: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | 3 | WORKDIR /tmp 4 | 5 | # Generic python installations 6 | # PyTorch Audio for DeepSpeech: 
https://github.com/SeanNaren/deepspeech.pytorch/releases 7 | # Development environment installations 8 | RUN apt-get update && apt-get install -y \ 9 | apt-utils \ 10 | python \ 11 | python-pip \ 12 | sox \ 13 | libsox-dev \ 14 | libsox-fmt-all \ 15 | git \ 16 | cmake \ 17 | tree \ 18 | htop \ 19 | bmon \ 20 | iotop \ 21 | tmux \ 22 | vim \ 23 | g++ 24 | 25 | RUN which g++ 26 | 27 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/pytorch/docker/build-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | nvidia-docker build . --rm -f Dockerfile.gpu -t ds2-cuda9cudnn7:gpu 4 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/pytorch/docker/run-dev.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nvidia-docker run \ 3 | -v /mnt/disk/mnt_dir:/mnt/disk/mnt_dir:rw \ 4 | -v /etc/passwd:/etc/passwd:ro \ 5 | -it --rm --user $(id -u) ds2-cuda9cudnn7:gpu 6 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/pytorch/run_and_time.sh: -------------------------------------------------------------------------------- 1 | # Script to train and time DeepSpeech 2 implementation 2 | 3 | RANDOM_SEED=1 4 | TARGET_ACC=23 5 | 6 | python train.py --model_path models/deepspeech_t$RANDOM_SEED.pth.tar --seed $RANDOM_SEED --acc $TARGET_ACC 7 | -------------------------------------------------------------------------------- /retired_benchmarks/never-adopted/speech_recognition/verify_dataset.sh: -------------------------------------------------------------------------------- 1 | # Script to verify the dataset 2 | 3 | #generate tar, this takes a few minutes 4 | tar -cf data-LibriSpeech-ref.tar LibriSpeech_dataset 5 | 6 | #generate 
checksum on tar, this takes a few minutes 7 | cksum data-LibriSpeech-ref.tar > data-LibriSpeech-cksum.out 8 | 9 | #check against ref checksum and report success/failure 10 | cmp --silent data-LibriSpeech-cksum.out data/data-LibriSpeech-ref-cksum.out && echo 'Dataset Checksum Passed.' || echo 'WARNING: Dataset Checksum Failed.' 11 | 12 | #remove generated checksum and tar 13 | rm data-LibriSpeech-ref.tar 14 | rm data-LibriSpeech-cksum.out 15 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | 3 | 4 | WORKDIR /research 5 | 6 | RUN apt-get update 7 | 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | ca-certificates \ 10 | build-essential \ 11 | git \ 12 | python \ 13 | python-pip 14 | 15 | 16 | ENV HOME /research 17 | ENV PYENV_ROOT $HOME/.pyenv 18 | ENV PATH $PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH 19 | 20 | 21 | RUN apt-get install -y python-setuptools 22 | 23 | RUN apt-get install -y python-pip python3-pip virtualenv htop 24 | RUN pip3 install --upgrade numpy scipy sklearn tf-nightly-gpu 25 | 26 | 27 | # Mount data into the docker 28 | ADD . /research/resnet 29 | 30 | 31 | WORKDIR /research/resnet 32 | RUN pip3 install -r official/requirements.txt 33 | 34 | 35 | ENTRYPOINT ["/bin/bash"] 36 | 37 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/README.md: -------------------------------------------------------------------------------- 1 | Install 2 | ========== 3 | 4 | In order to run this, you must first set stuff up... for now see Transformer's README. 5 | 6 | 7 | Downloading Data 8 | ========== 9 | 10 | Downloading data is TBD. 11 | 12 | 13 | Processing Data 14 | ============= 15 | 16 | TBD.
17 | 18 | 19 | Running the Benchmark 20 | ============ 21 | 22 | You first must build the docker file; 23 | 24 | docker build . 25 | 26 | 27 | Remember the image name/number. 28 | 29 | 30 | 1. Make sure /imn on the host contains the pre-processed data. (Scripts for this TODO). 31 | 2. Choose your random seed (below we use 77) 32 | 3. Enter your docker's image name (below we use 5ca81979cbc2 which you don't have) 33 | 34 | Then, execute the following: 35 | 36 | sudo docker run -v /imn:/imn --runtime=nvidia -t -i 5ca81979cbc2 "./run_and_time.sh" 77 | tee benchmark.log 37 | 38 | 39 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/.gitignore: -------------------------------------------------------------------------------- 1 | MNIST-data 2 | labels.txt 3 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # Docker image for running examples in Tensorflow models. 2 | # base_image depends on whether we are running on GPUs or non-GPUs 3 | FROM ubuntu:latest 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | ca-certificates \ 7 | build-essential \ 8 | git \ 9 | python \ 10 | python-pip \ 11 | python-setuptools 12 | 13 | RUN pip install tf-nightly 14 | 15 | # Checkout tensorflow/models at HEAD 16 | RUN git clone https://github.com/tensorflow/models.git /tensorflow_models 17 | 18 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | # Docker image for running examples in Tensorflow models.
2 | # base_image depends on whether we are running on GPUs or non-GPUs 3 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 4 | 5 | RUN apt-get update && apt-get install -y --no-install-recommends \ 6 | ca-certificates \ 7 | build-essential \ 8 | git \ 9 | python \ 10 | python-pip \ 11 | python-setuptools 12 | 13 | RUN pip install tf-nightly-gpu 14 | 15 | # Checkout tensorflow/models at HEAD 16 | RUN git clone https://github.com/tensorflow/models.git /tensorflow_models 17 | 18 | 19 | -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/requirements.txt: -------------------------------------------------------------------------------- 1 | psutil>=5.4.3 2 | py-cpuinfo>=3.3.0 3 | google-cloud-bigquery>=0.31.0 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/resnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/resnet/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/__init__.py -------------------------------------------------------------------------------- 
/retired_benchmarks/resnet-tf1/official/utils/arg_parsers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/arg_parsers/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/export/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/export/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/logs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/logs/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/misc/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/__init__.py -------------------------------------------------------------------------------- 
/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/results.json: -------------------------------------------------------------------------------- 1 | [1, 1, 0.4701630473136902, 0.4701630473136902, 0.4701630473136902] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/dense/tf_version.json: -------------------------------------------------------------------------------- 
1 | ["1.8.0-dev20180325", "v1.7.0-rc1-750-g6c1737e6c8"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- 1 | ʼ|? -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/results.json: -------------------------------------------------------------------------------- 1 | [0.9872556924819946] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/reference_data_test/uniform_random/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180325", "v1.7.0-rc1-750-g6c1737e6c8"] 
-------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/results.json: 
-------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.08920872211456299, 0.8918969631195068, 4064.7060546875, 32, 4, 4, 8, 0.0, 0.10715862363576889, 2344.4775390625] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-1_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/model.ckpt.index: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.918815016746521, 0.1826801300048828, 4064.4677734375, 32, 4, 4, 8, -1.3153012990951538, 0.011247094720602036, 261.84716796875] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_projection_version-2_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.1677999496459961, 0.7767924070358276, 4089.44189453125, 32, 8, 8, 4, 0.8615571856498718, 1.1359407901763916, 5806.876953125] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-1_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/expected_graph: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.8239736557006836, 0.3485994338989258, 4108.87548828125, 32, 8, 8, 4, 0.16798323392868042, -0.2975311279296875, 2860.068359375] -------------------------------------------------------------------------------- 
/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_bottleneck_version-2_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.5349493026733398, 0.5126370191574097, 4070.01220703125, 32, 4, 4, 8, 0.0, 2.7680201530456543, 2341.23486328125] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-1_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.7820245027542114, 0.8173515796661377, 4095.256591796875, 32, 4, 4, 8, 0.0679062008857727, 0.009305447340011597, -137.36178588867188] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_projection_version-2_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/model.ckpt.index -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.23128163814544678, 0.22117376327514648, 4100.51806640625, 32, 8, 8, 4, 1.1768392324447632, 0.2728465795516968, 5832.6416015625] -------------------------------------------------------------------------------- 
/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-1_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/model.ckpt.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/model.ckpt.index 
-------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/results.json: -------------------------------------------------------------------------------- 1 | [32, 8, 8, 4, 0.7616699934005737, 0.5485763549804688, 4106.8720703125, 32, 8, 8, 4, -0.056346118450164795, 0.5792689919471741, 2972.37255859375] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch-size-32_building_version-2_width-8_channels-4/tf_version.json: -------------------------------------------------------------------------------- 1 | ["1.8.0-dev20180408", "v1.7.0-1345-gb874783ccd"] -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch_norm/expected_graph: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch_norm/expected_graph -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch_norm/model.ckpt.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch_norm/model.ckpt.data-00000-of-00001 -------------------------------------------------------------------------------- /retired_benchmarks/resnet-tf1/official/utils/testing/reference_data/resnet/batch_norm/model.ckpt.index: 
#!/bin/bash
# Launch the ResNet-50 (v1) ImageNet reference training run.
#
# Usage: ./run.sh <random_seed> <quality>
#   random_seed  forwarded to imagenet_main.py as its first positional
#                argument and used as the suffix of the model directory
#   quality      forwarded to imagenet_main.py as --stop_threshold

RANDOM_SEED=$1
QUALITY=$2
set -e

# Register the model as a source root
export PYTHONPATH="$(pwd):${PYTHONPATH}"

MODEL_DIR="/tmp/resnet_imagenet_${RANDOM_SEED}"

# $RANDOM_SEED and $QUALITY stay unquoted on purpose: an omitted argument is
# then dropped from the command line instead of being passed as an empty string.
python3 official/resnet/imagenet_main.py $RANDOM_SEED --data_dir /imn/imagenet/combined/ \
  --model_dir "$MODEL_DIR" --train_epochs 10000 --stop_threshold $QUALITY --batch_size 64 \
  --version 1 --resnet_size 50 --epochs_between_evals 4

# To run on 8xV100s, instead run:
#python3 official/resnet/imagenet_main.py $RANDOM_SEED --data_dir /imn/imagenet/combined/ \
#  --model_dir $MODEL_DIR --train_epochs 10000 --stop_threshold $QUALITY --batch_size 1024 \
#  --version 1 --resnet_size 50 --dtype fp16 --num_gpus 8 \
#  --epochs_between_evals 4
/retired_benchmarks/resnet-tf2/verify_dataset.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # TODO 4 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/.dockerignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 2 | tb_*/ 3 | results/ 4 | __pycache__ 5 | _legacy/ 6 | lightning_logs/ 7 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/NOTICE: -------------------------------------------------------------------------------- 1 | RNN-T in PyTorch 2 | 3 | This repository includes source code (in "rnnt/") from: 4 | * https://github.com/keithito/tacotron and https://github.com/ryanleary/patter licensed under MIT license. 5 | 6 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/rnnt/pytorch/common/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/common/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .helpers import * 2 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/common/data/dali/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
def dataset_size(dataset):
    """Return the size reported by a data loader.

    A ``DaliDataLoader`` exposes the value as its ``dataset_size``
    attribute; every other loader is handled as a PyTorch loader and the
    value is read from ``dataset.sampler.num_samples``.
    """
    if not isinstance(dataset, DaliDataLoader):
        # PyTorch
        return dataset.sampler.num_samples
    # DALI
    return dataset.dataset_size
6 | 7 | The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. ''' 8 | from . import cmudict 9 | 10 | _pad = '_' 11 | _punctuation = '!\'(),.:;? ' 12 | _special = '-' 13 | _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' 14 | 15 | # Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters): 16 | _arpabet = ['@' + s for s in cmudict.valid_symbols] 17 | 18 | # Export all symbols: 19 | symbols = [_pad] + list(_special) + list(_punctuation) + list(_letters) + _arpabet 20 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.3' 2 | services: 3 | test: 4 | deploy: 5 | resources: 6 | reservations: 7 | devices: 8 | - capabilities: 9 | - gpu 10 | build: 11 | context: . 
12 | dockerfile: tests/Dockerfile 13 | volumes: 14 | - .:/code 15 | - /mnt/mwawrzos/storage/datasets/LibriSpeech/LibriSpeech:/datasets/LibriSpeech 16 | stdin_open: true 17 | tty: true 18 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/mlperf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/rnnt/pytorch/mlperf/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | https://github.com/NVIDIA/dllogger/archive/26a0f8f1958de2c0c460925ff6102a4d2486d6cc.zip 2 | https://github.com/mlcommons/logging/archive/d08740cadb4188a5ebeb84ad6c68f98c1e129805.zip 3 | tensorboard==2.3.0 4 | unidecode==1.1.1 5 | inflect==4.1.0 6 | soundfile==0.10.3.post1 7 | librosa==0.8.0 8 | sox==1.4.1 9 | sentencepiece==0.1.94 10 | pandas==1.1.5 11 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/scripts/docker/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | docker build . --rm -t mlperf/rnn_speech_recognition 4 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/scripts/inference_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | set -a 18 | 19 | : ${CUDNN_BENCHMARK:=true} 20 | : ${MAX_DURATION:=36} 21 | : ${PAD_TO_MAX_DURATION:=true} 22 | 23 | bash ./scripts/inference.sh "$@" 24 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/tests/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/pytorch:20.10-py3 2 | 3 | COPY tests/requirements.txt . 4 | RUN pip install --upgrade pip && \ 5 | pip install -r requirements.txt 6 | 7 | COPY requirements.txt . 
8 | RUN pip install -r requirements.txt 9 | 10 | 11 | 12 | WORKDIR /code 13 | 14 | CMD bash 15 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/tests/requirements.txt: -------------------------------------------------------------------------------- 1 | pytest==6.1.2 2 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/rnnt/pytorch/utils/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/utils/inference_librispeech.csv: -------------------------------------------------------------------------------- 1 | url,md5 2 | http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 3 | http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 4 | http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 5 | http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 6 | -------------------------------------------------------------------------------- /retired_benchmarks/rnnt/pytorch/utils/librispeech.csv: -------------------------------------------------------------------------------- 1 | url,md5 2 | http://www.openslr.org/resources/12/dev-clean.tar.gz,42e2234ba48799c1f50f24a7926300a1 3 | http://www.openslr.org/resources/12/dev-other.tar.gz,c8d0bcc9cca99d4f8b62fcc847357931 4 | http://www.openslr.org/resources/12/test-clean.tar.gz,32fa31d27d2e1cad72775fee3f4849a9 5 | http://www.openslr.org/resources/12/test-other.tar.gz,fb5a50374b501bb3bac4815ee91d3135 6 | http://www.openslr.org/resources/12/train-clean-100.tar.gz,2a93770f6d5c6c964bc36631d331a522 7 | 
http://www.openslr.org/resources/12/train-clean-360.tar.gz,c0e676e450a7ff2f54aeade5171606fa 8 | http://www.openslr.org/resources/12/train-other-500.tar.gz,d1a0fd59409feb2c614ce4d30c387708 9 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:1.6.0-cuda10.1-cudnn7-runtime 2 | 3 | # Set working directory 4 | WORKDIR /mlperf/ssd 5 | 6 | # Necessary zone info for tzdata 7 | ENV TZ=America/New_York 8 | RUN ln -fs /usr/share/zoneinfo/America/New_York /etc/localtime 9 | 10 | # Install system dependencies 11 | RUN apt-get update && \ 12 | apt-get install -y python3-tk python-pip numactl git 13 | 14 | RUN pip install --upgrade pip 15 | 16 | # Necessary pip packages 17 | COPY requirements.txt /requirements.txt 18 | 19 | RUN pip install --no-cache-dir cython \ 20 | && pip install --no-cache-dir https://github.com/mlperf/logging/archive/9ea0afa.zip \ 21 | && pip install --no-cache-dir -r /requirements.txt 22 | 23 | # Copy SSD code 24 | COPY ssd . 
25 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/download_dataset.sh: -------------------------------------------------------------------------------- 1 | # Get COCO 2017 data sets 2 | dir=$(pwd) 3 | mkdir /coco; cd /coco 4 | curl -O http://images.cocodataset.org/zips/train2017.zip; unzip train2017.zip 5 | curl -O http://images.cocodataset.org/zips/val2017.zip; unzip val2017.zip 6 | curl -O http://images.cocodataset.org/annotations/annotations_trainval2017.zip; unzip annotations_trainval2017.zip 7 | cd $dir 8 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/download_resnet34_backbone.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd ssd/ 4 | curl -O https://download.pytorch.org/models/resnet34-333f7ec4.pth 5 | 6 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython==0.28.4 2 | git+git://github.com/NVIDIA/apex.git@9041a868a1a253172d94b113a963375b9badd030#egg=apex 3 | mlperf-compliance==0.0.10 4 | cycler==0.10.0 5 | kiwisolver==1.0.1 6 | matplotlib==2.2.2 7 | numpy==1.19.1 8 | Pillow==5.2.0 9 | pyparsing==2.2.0 10 | python-dateutil==2.7.3 11 | pytz==2018.5 12 | six==1.11.0 13 | torchvision==0.2.1 14 | pycocotools==2.0.2 15 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/ssd/config_DGX1_32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## DL params 4 | EXTRA_PARAMS=( 5 | --batch-size "32" 6 | ) 7 | 8 | ## System run parms 9 | DGXNNODES=1 10 | DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 11 | WALLTIME=24:00:00 12 | 13 | ## System config params 
14 | DGXNGPU=1 15 | DGXSOCKETCORES=4 16 | DGXNSOCKET=1 17 | DGXHT=1 # HT is on is 2, HT off is 1 18 | DGXIBDEVICES='' 19 | -------------------------------------------------------------------------------- /retired_benchmarks/ssd-v1/ssd/config_DGX1_singlenode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## DL params 4 | EXTRA_PARAMS=( 5 | --batch-size "128" 6 | --warmup "2.619685" # 300 iterations * 8 GPUs * 1 nodes * 128 batch size / 117266 non-empty images 7 | ) 8 | 9 | ## System run parms 10 | DGXNNODES=1 11 | DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | WALLTIME=12:00:00 13 | 14 | ## System config params 15 | DGXNGPU=8 16 | DGXSOCKETCORES=20 17 | DGXNSOCKET=1 18 | DGXHT=1 # HT is on is 2, HT off is 1 19 | DGXIBDEVICES='' 20 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | wget https://raw.githubusercontent.com/tensorflow/models/master/official/transformer/test_data/newstest2014.en -O tensorflow/newstest2014.en 4 | wget https://raw.githubusercontent.com/tensorflow/models/master/official/transformer/test_data/newstest2014.de -O tensorflow/newstest2014.de 5 | 6 | python3 data_download.py --raw_dir raw_data 7 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:9.0-cudnn7-runtime-ubuntu16.04 2 | WORKDIR /research 3 | RUN apt-get update 4 | RUN apt-get update && apt-get install -y --no-install-recommends \ 5 | ca-certificates \ 6 | build-essential \ 7 | git \ 8 | python \ 9 | python-pip 10 | ENV HOME /research 11 | ENV PYENV_ROOT $HOME/.pyenv 12 | ENV PATH 
$PYENV_ROOT/shims:$PYENV_ROOT/bin:$PATH 13 | RUN apt-get install -y python-setuptools 14 | RUN apt-get install -y python-pip python3-pip virtualenv htop 15 | RUN pip3 install --upgrade numpy scipy sklearn tensorflow-gpu==1.9.0 16 | 17 | ENV LANG C.UTF-8 18 | ENV LC_ALL C.UTF-8 19 | 20 | # Mount data into the docker 21 | ADD . /research/transformer 22 | WORKDIR /research/transformer 23 | RUN pip3 install -r requirements.txt 24 | ENTRYPOINT ["/bin/bash"] 25 | 26 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/bert/README.md: -------------------------------------------------------------------------------- 1 | README 2 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.2.0 2 | astor==0.6.2 3 | bleach==1.5.0 4 | cachetools==2.0.1 5 | certifi==2018.4.16 6 | chardet==3.0.4 7 | gast==0.2.0 8 | google-api-core==1.1.1 9 | google-auth==1.4.1 10 | google-cloud-bigquery==1.1.0 11 | google-cloud-core==0.28.1 12 | google-resumable-media==0.3.1 13 | googleapis-common-protos==1.5.3 14 | grpcio==1.11.0 15 | html5lib==0.9999999 16 | idna==2.6 17 | Markdown==2.6.11 18 | numpy==1.14.2 19 | protobuf==3.6.0 20 | psutil==5.4.5 21 | py-cpuinfo==4.0.0 22 | pyasn1==0.4.2 23 | pyasn1-modules==0.2.1 24 | pytz==2018.4 25 | requests==2.18.4 26 | rsa==3.4.2 27 | scikit-learn==0.19.1 28 | scipy==1.0.1 29 | six==1.11.0 30 | sklearn==0.0 31 | tb-nightly==1.8.0a20180420 32 | termcolor==1.1.0 33 | urllib3==1.22 34 | virtualenv==15.0.1 35 | Werkzeug==0.14.1 36 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/run_preprocessing.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SEED=$1 6 | 7 | cd /research/transformer 8 | 9 | # TODO: Add SEED to process_data.py since this uses a random generator (future PR) 10 | export PYTHONPATH=/research/transformer/transformer:${PYTHONPATH} 11 | # Add compliance to PYTHONPATH 12 | # export PYTHONPATH=/mlperf/training/compliance:${PYTHONPATH} 13 | 14 | python3 process_data.py --raw_dir /raw_data/ --data_dir processed_data 15 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | SEED=$1 6 | QUALITY=$2 7 | 8 | cd /research/transformer 9 
| 10 | export PYTHONPATH=/research/transformer/transformer:${PYTHONPATH} 11 | # Add compliance to PYTHONPATH 12 | # export PYTHONPATH=/mlperf/training/compliance:${PYTHONPATH} 13 | 14 | python3 transformer/transformer_main.py --random_seed=${SEED} --data_dir=processed_data/ --model_dir=model --params=big --bleu_threshold ${QUALITY} --bleu_source=newstest2014.en --bleu_ref=newstest2014.de 15 | -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/transformer/tensorflow/transformer/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/transformer/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/transformer/tensorflow/transformer/model/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/transformer/tensorflow/transformer/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/retired_benchmarks/transformer/tensorflow/transformer/utils/__init__.py -------------------------------------------------------------------------------- /retired_benchmarks/unet3d/pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG FROM_IMAGE_NAME=pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime 2 | #ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3 3 | FROM ${FROM_IMAGE_NAME} 4 | 5 | ADD . 
/workspace/unet3d 6 | WORKDIR /workspace/unet3d 7 | 8 | RUN apt-get update && \ 9 | apt-get upgrade -y && \ 10 | apt-get install -y git 11 | RUN apt-get install -y vim 12 | 13 | RUN pip install --upgrade pip 14 | RUN pip install --disable-pip-version-check -r requirements.txt 15 | 16 | #RUN pip uninstall -y apex; pip uninstall -y apex; git clone --branch seryilmaz/fused_dropout_softmax https://github.com/seryilmaz/apex.git; cd apex; pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--xentropy" --global-option="--deprecated_fused_adam" --global-option="--deprecated_fused_lamb" --global-option="--fast_multihead_attn" . 17 | -------------------------------------------------------------------------------- /retired_benchmarks/unet3d/pytorch/evaluation_cases.txt: -------------------------------------------------------------------------------- 1 | 00000 2 | 00003 3 | 00005 4 | 00006 5 | 00012 6 | 00024 7 | 00034 8 | 00041 9 | 00044 10 | 00049 11 | 00052 12 | 00056 13 | 00061 14 | 00065 15 | 00066 16 | 00070 17 | 00076 18 | 00078 19 | 00080 20 | 00084 21 | 00086 22 | 00087 23 | 00092 24 | 00111 25 | 00112 26 | 00125 27 | 00128 28 | 00138 29 | 00157 30 | 00160 31 | 00161 32 | 00162 33 | 00169 34 | 00171 35 | 00176 36 | 00185 37 | 00187 38 | 00189 39 | 00198 40 | 00203 41 | 00206 42 | 00207 -------------------------------------------------------------------------------- /retired_benchmarks/unet3d/pytorch/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/NVIDIA/dllogger 2 | https://github.com/mlcommons/logging/archive/refs/tags/1.1.0-rc4.zip 3 | nibabel==3.2.1 4 | scipy==1.5.2 -------------------------------------------------------------------------------- /single_stage_detector/.dockerignore: -------------------------------------------------------------------------------- 1 | mlcube/workspace/ 
-------------------------------------------------------------------------------- /single_stage_detector/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-image>=0.15.0 2 | ujson>=4.0.2 3 | matplotlib>=3.5.1 4 | pycocotools>=2.0.4 5 | git+https://github.com/mlcommons/logging.git@1.1.0-rc4 6 | fiftyone==0.15.1 7 | -------------------------------------------------------------------------------- /single_stage_detector/scripts/download_backbone.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | DOWNLOAD_LINK='https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth' 4 | SHA512='15c9f0bc1c8d64750712f86ffaded3b0bc6a87e77a395dcda3013d8af65b7ebf3ca1c24dd3aae60c0d83e510b4d27731f0526b6f9392c0a85ffc18e5fecd8a13' 5 | FILENAME='resnext50_32x4d-7cdf4587.pth' 6 | FOLDER_PATH="./" 7 | 8 | # Handle MLCube parameters 9 | while [ $# -gt 0 ]; do 10 | case "$1" in 11 | --model_dir=*) 12 | FOLDER_PATH="${1#*=}" 13 | ;; 14 | *) 15 | esac 16 | shift 17 | done 18 | 19 | wget -c $DOWNLOAD_LINK -P $FOLDER_PATH 20 | echo "${SHA512} ${FOLDER_PATH}/${FILENAME}" | sha512sum -c 21 | -------------------------------------------------------------------------------- /single_stage_detector/scripts/download_openimages_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | : "${DATASET_PATH:=/datasets/open-images-v6-mlperf}" 4 | 5 | while [ "$1" != "" ]; do 6 | case $1 in 7 | -d | --dataset-path) 8 | shift 9 | DATASET_PATH=$1 10 | ;; 11 | --data_dir=*) 12 | if [[ "$PWD" = /workspace/single_stage_detector/ssd ]]; then 13 | cd ../scripts 14 | DATASET_PATH="${1#*=}" 15 | fi 16 | ;; 17 | esac 18 | shift 19 | done 20 | 21 | echo "saving to" 22 | echo $DATASET_PATH 23 | ls $DATASET_PATH 24 | 25 | MLPERF_CLASSES=('Apple' 'Banana') 26 | 27 | python fiftyone_openimages.py \ 28 | --dataset-dir=${DATASET_PATH} \ 29 | 
--output-labels="openimages-mlperf.json" \ 30 | --classes "${MLPERF_CLASSES[@]}" 31 | -------------------------------------------------------------------------------- /single_stage_detector/scripts/download_openimages_full.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | : "${DATASET_PATH:=/datasets/open-images-v6}" 4 | 5 | while [ "$1" != "" ]; do 6 | case $1 in 7 | -d | --dataset-path ) shift 8 | DATASET_PATH=$1 9 | ;; 10 | esac 11 | shift 12 | done 13 | 14 | python fiftyone_openimages.py \ 15 | --dataset-dir=${DATASET_PATH} 16 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Installer logs 7 | pip-log.txt 8 | pip-delete-this-directory.txt 9 | 10 | # Unit test / coverage reports 11 | htmlcov/ 12 | .tox/ 13 | .nox/ 14 | .coverage 15 | .coverage.* 16 | .cache 17 | nosetests.xml 18 | coverage.xml 19 | *.cover 20 | .hypothesis/ 21 | .pytest_cache/ 22 | 23 | # IPython 24 | profile_default/ 25 | ipython_config.py 26 | 27 | # Environments 28 | .env 29 | .venv 30 | env/ 31 | venv/ 32 | ENV/ 33 | env.bak/ 34 | venv.bak/ 35 | 36 | # IDEs 37 | .idea/ 38 | .vscode/ 39 | 40 | # binary files 41 | *.pth 42 | *.pickle 43 | *.onnx 44 | 45 | # Misc 46 | torch-model-cache/ 47 | nogit/ 48 | TODO 49 | *.log 50 | 51 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/config_DGXA100_001x08x032.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## DL params 4 | export BATCHSIZE=32 5 | export NUMEPOCHS=${NUMEPOCHS:-8} 6 | export DATASET_DIR="/datasets/open-images-v6-mlperf" 7 | export EXTRA_PARAMS='--lr 0.0001 --output-dir=/results' 8 | 9 | ## System run 
parms 10 | export DGXNNODES=1 11 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | export WALLTIME=08:00:00 13 | 14 | ## System config params 15 | export DGXNGPU=8 16 | export DGXSOCKETCORES=64 17 | export DGXNSOCKET=2 18 | export DGXHT=2 # HT is on is 2, HT off is 1 19 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/config_DGXA100_002x08x016.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## DL params 4 | export BATCHSIZE=16 5 | export NUMEPOCHS=${NUMEPOCHS:-8} 6 | export DATASET_DIR="/datasets/open-images-v6-mlperf" 7 | export EXTRA_PARAMS='--lr 0.0001 --output-dir=/results' 8 | 9 | ## System run parms 10 | export DGXNNODES=2 11 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | export WALLTIME=04:00:00 13 | 14 | ## System config params 15 | export DGXNGPU=8 16 | export DGXSOCKETCORES=64 17 | export DGXNSOCKET=2 18 | export DGXHT=2 # HT is on is 2, HT off is 1 19 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/config_DGXA100_008x08x004_inference_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## DL params 4 | export BATCHSIZE=4 5 | export NUMEPOCHS=${NUMEPOCHS:-15} 6 | export DATASET_DIR="/datasets/open-images-v6-mlperf" 7 | export EXTRA_PARAMS='--lr 0.0001 --output-dir=/results --target-map 0.99' 8 | 9 | ## System run parms 10 | export DGXNNODES=8 11 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | export WALLTIME=04:00:00 13 | 14 | ## System config params 15 | export DGXNGPU=8 16 | export DGXSOCKETCORES=64 17 | export DGXNSOCKET=2 18 | export DGXHT=2 # HT is on is 2, HT off is 1 19 | 
-------------------------------------------------------------------------------- /single_stage_detector/ssd/config_DGXA100_008x08x008.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## DL params 4 | export BATCHSIZE=8 5 | export NUMEPOCHS=${NUMEPOCHS:-8} 6 | export DATASET_DIR="/datasets/open-images-v6-mlperf" 7 | export EXTRA_PARAMS='--lr 0.0001 --output-dir=/results' 8 | 9 | ## System run parms 10 | export DGXNNODES=8 11 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | export WALLTIME=04:00:00 13 | 14 | ## System config params 15 | export DGXNGPU=8 16 | export DGXSOCKETCORES=64 17 | export DGXNSOCKET=2 18 | export DGXHT=2 # HT is on is 2, HT off is 1 19 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/config_DGXA100_032x08x032.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ## DL params 4 | export BATCHSIZE=32 5 | export NUMEPOCHS=${NUMEPOCHS:-35} 6 | export DATASET_DIR="/datasets/open-images-v6-mlperf" 7 | export EXTRA_PARAMS='--lr 0.0001 --output-dir=/results' 8 | 9 | ## System run parms 10 | export DGXNNODES=32 11 | export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' ) 12 | export WALLTIME=04:00:00 13 | 14 | ## System config params 15 | export DGXNGPU=8 16 | export DGXSOCKETCORES=64 17 | export DGXNSOCKET=2 18 | export DGXHT=2 # HT is on is 2, HT off is 1 19 | -------------------------------------------------------------------------------- /single_stage_detector/ssd/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlcommons/training/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/single_stage_detector/ssd/model/__init__.py 
class ImageList(object):
    """
    Container pairing a batch of images with their original sizes.

    Images of possibly different sizes are padded to a common size and
    kept as one tensor; the pre-padding size of every image is stored
    alongside it.
    """

    def __init__(self, tensors: Tensor, image_sizes: List[Tuple[int, int]]):
        """
        Args:
            tensors (tensor): the padded image batch
            image_sizes (list[tuple[int, int]]): original size of each image
        """
        self.image_sizes = image_sizes
        self.tensors = tensors

    def to(self, device: torch.device) -> 'ImageList':
        """Return a new ImageList whose tensor has been moved to ``device``.

        The ``image_sizes`` list is shared with the new instance, not copied.
        """
        return ImageList(self.tensors.to(device), self.image_sizes)
class TsvDataset(Dataset):
    """Map-style dataset backed by a tab-separated annotations file.

    The file is read once with pandas (first row is the header); each
    item is a dict holding the requested columns for one row.
    """

    def __init__(self, annotations_file, keys):
        # Columns to expose in every sample.
        self.keys = keys
        self.df = pd.read_csv(annotations_file, sep='\t', header=0)

    def __len__(self):
        """Number of rows in the annotations table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``{key: value}`` for row ``idx`` over the configured keys."""
        # Index column-by-column so each value keeps its column's dtype.
        return {column: self.df[column].iloc[idx] for column in self.keys}
def build_dataloader(annotations_file,
                     keys,
                     batch_size,
                     shuffle=False,
                     num_workers=1,
                     pin_memory=True):
    """Create a ``DataLoader`` over a ``TsvDataset``.

    Args:
        annotations_file: path of the tab-separated annotations file.
        keys: column names each sample should contain.
        batch_size: forwarded to ``DataLoader``.
        shuffle: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.
        pin_memory: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping the newly built dataset.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
    }
    tsv_dataset = TsvDataset(annotations_file, keys=keys)
    return DataLoader(tsv_dataset, **loader_options)
def spatial_norm_thresholding(x0, value):
    """Rescale ``x0`` so its RMS over dim 1 does not exceed ``value``.

    The root-mean-square is taken over dimension 1 (kept as a size-1 dim)
    and clamped from below at ``value``, so positions whose RMS is already
    at most ``value`` are multiplied by 1 and left unchanged.
    """
    # b c h w  (layout noted in the original source)
    mean_square = x0.pow(2).mean(1, keepdim=True)
    rms = mean_square.sqrt()
    clamped_rms = rms.clamp(min=value)
    scale = value / clamped_rms
    return x0 * scale
#!/usr/bin/env bash
#
# Download the OpenCLIP checkpoint (open_clip_pytorch_model.bin) and its
# config (open_clip_config.json) from the
# laion/CLIP-ViT-H-14-laion2B-s32B-b79K Hugging Face repository into
# OUTPUT_DIR, then verify the SHA-256 of the weights.
#
# Usage: download_clip.sh [-o|--output-dir DIR]
#   OUTPUT_DIR may also be supplied through the environment
#   (default: /checkpoints/clip). Unrecognised arguments are ignored.

# Abort on the first failing command so that a failed download is reported
# instead of being masked by the exit status of the final checksum.
set -euo pipefail

: "${OUTPUT_DIR:=/checkpoints/clip}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -o | --output-dir )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            OUTPUT_DIR=$1
            ;;
    esac
    shift
done

CLIP_WEIGHTS_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin"
CLIP_WEIGHTS_SHA256="9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"

CLIP_CONFIG_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/raw/main/open_clip_config.json"

# Quote every expansion so an output directory containing spaces or glob
# characters is passed to wget as a single argument.
wget -N -P "${OUTPUT_DIR}" "${CLIP_WEIGHTS_URL}"
wget -N -P "${OUTPUT_DIR}" "${CLIP_CONFIG_URL}"

# "<hash>  <file>" (two spaces) is the canonical sha256sum check-line format.
echo "${CLIP_WEIGHTS_SHA256}  ${OUTPUT_DIR}/open_clip_pytorch_model.bin" | sha256sum -c
#!/bin/bash
#
# Download the COCO 2014 validation images and the train/val 2014 annotations
# into DOWNLOAD_PATH, verify both archives with SHA-1, then unzip them there.
#
# Usage: coco-2014-validation-download.sh [-d|--download-path DIR]
#   DOWNLOAD_PATH may also be supplied through the environment
#   (default: /datasets/coco2014). Unrecognised arguments are ignored.

# Stop at the first failure. Without this, a failed `cd` would download into
# the caller's working directory, and an archive that fails its checksum
# would still be extracted.
set -euo pipefail

: "${DOWNLOAD_PATH:=/datasets/coco2014}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -d | --download-path )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            DOWNLOAD_PATH=$1
            ;;
    esac
    shift
done

# Quoted so that a path containing spaces stays a single argument.
mkdir -p "${DOWNLOAD_PATH}"
cd "${DOWNLOAD_PATH}"

# -c resumes a partially downloaded archive instead of starting over.
wget -c http://images.cocodataset.org/zips/val2014.zip
wget -c http://images.cocodataset.org/annotations/annotations_trainval2014.zip

# "<hash>  <file>" (two spaces) is the canonical sha1sum check-line format.
echo "fbedd73593f242db65cce6bcefde193fcedcc5c0  ./val2014.zip" | sha1sum -c
echo "8e0b9df54c175f1688400e98d1a97f292e726870  ./annotations_trainval2014.zip" | sha1sum -c

# Only reached when both checksums matched.
unzip val2014.zip
unzip annotations_trainval2014.zip
#!/bin/bash
#
# Run webdataset_images2latents.py once for every *.tar file in INPUT_FOLDER,
# writing an identically named tar file into OUTPUT_FOLDER.
#
# Usage: laion400m-convert-images-to-moments.sh [-i|--input-folder DIR]
#                                               [-o|--output-folder DIR]
#   Both folders may also be supplied through the environment. Unrecognised
#   arguments are ignored. The config (configs/train_512.yaml) and checkpoint
#   (/checkpoints/sd/512-base-ema.ckpt) paths are fixed, and the Python script
#   and config are resolved relative to the current working directory.

# Stop at the first failed conversion instead of silently moving on to the
# next shard and leaving a gap in the output.
set -euo pipefail

: "${INPUT_FOLDER:=/datasets/laion-400m/webdataset-filtered}"
: "${OUTPUT_FOLDER:=/datasets/laion-400m/webdataset-latents-filtered}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -i | --input-folder )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            INPUT_FOLDER=$1
            ;;
        -o | --output-folder )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            OUTPUT_FOLDER=$1
            ;;
    esac
    shift
done

mkdir -p "${OUTPUT_FOLDER}"

# With nullglob an unmatched pattern expands to nothing, so an empty input
# folder is reported here rather than passing the literal "*.tar" to Python.
shopt -s nullglob
tar_files=("${INPUT_FOLDER}"/*.tar)
if [ "${#tar_files[@]}" -eq 0 ]; then
    echo "error: no .tar files found in ${INPUT_FOLDER}" >&2
    exit 1
fi

# Loop over each tar file in the input directory
for tar_file in "${tar_files[@]}"; do
    file_name=$(basename -- "${tar_file}")
    # Strip the extension; ".tar" is re-appended for the output name below.
    base_name="${file_name%.*}"
    python webdataset_images2latents.py \
        --input-tar "${tar_file}" \
        --output-tar "${OUTPUT_FOLDER}/${base_name}.tar" \
        --config configs/train_512.yaml \
        --ckpt /checkpoints/sd/512-base-ema.ckpt
done
#!/usr/bin/env bash
#
# Run scripts/datasets/filter-metadata.py on the metadata found in
# INPUT_METADATA_DIR and write its output to OUTPUT_METADATA_DIR.
#
# Usage: laion400m-filter-metadata.sh [-i|--input-metadata-dir DIR]
#                                     [-o|--output-metadata-dir DIR]
#   Both directories may also be supplied through the environment.
#   Unrecognised arguments are ignored. The Python script path is relative,
#   so the caller's working directory must make it resolvable.

# Abort if the output directory cannot be created, rather than starting the
# filter against a missing destination.
set -euo pipefail

: "${INPUT_METADATA_DIR:=/datasets/laion-400m/metadata}"
: "${OUTPUT_METADATA_DIR:=/datasets/laion-400m/metadata-filtered}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -i | --input-metadata-dir )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            INPUT_METADATA_DIR=$1
            ;;
        -o | --output-metadata-dir )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            OUTPUT_METADATA_DIR=$1
            ;;
    esac
    shift
done

# Quoted so that directories containing spaces stay single arguments.
mkdir -p "${OUTPUT_METADATA_DIR}"

python scripts/datasets/filter-metadata.py \
    --input-folder "${INPUT_METADATA_DIR}" \
    --output-folder "${OUTPUT_METADATA_DIR}"
#!/usr/bin/env bash
#
# Copy the filtered LAION-400M "moments" webdataset shards (*.tar) and their
# sha512sums.txt from the mlc-training rclone remote into OUTPUT_DIR, then
# verify every shard against the checksum list.
#
# Usage: laion400m-filtered-download-moments.sh [-o|--output-dir DIR]
#   OUTPUT_DIR may also be supplied through the environment
#   (default: /datasets/laion-400m/webdataset-moments-filtered).
#   Unrecognised arguments are ignored.

# Stop at the first failure so a failed `cd` or copy is not followed by a
# checksum run in the wrong directory.
set -euo pipefail

: "${OUTPUT_DIR:=/datasets/laion-400m/webdataset-moments-filtered}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -o | --output-dir )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires a directory argument" >&2
                exit 1
            fi
            shift
            OUTPUT_DIR=$1
            ;;
    esac
    shift
done

mkdir -p "${OUTPUT_DIR}"
cd "${OUTPUT_DIR}"


rclone config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com

# The working directory is already OUTPUT_DIR, so copy into "." here. Reusing
# a relative OUTPUT_DIR after the cd would resolve to OUTPUT_DIR/OUTPUT_DIR
# and leave the checksum step below without its files.
rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/ . --include="*.tar" -P

rclone copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/sha512sums.txt . -P

# --quiet suppresses the per-file OK lines; mismatches are still reported.
sha512sum --quiet -c sha512sums.txt
#!/usr/bin/env bash
#
# Start an interactive bash shell in the benchmark container. The container
# is removed on exit (--rm), sees all GPUs, shares the host IPC namespace,
# and has the current directory mounted (and set as workdir) at /pwd. The
# host dataset, checkpoint and results directories are bind-mounted at the
# same paths inside the container.
#
# Usage: launch.sh [-d|--dst-img IMAGE]
#   DST_IMG may also be supplied through the environment. Unrecognised
#   arguments are ignored.

set -euo pipefail

# Default to the tag that scripts/docker/build.sh produces by default
# (mlperf_sd:24.01-py3), so build followed by launch works without passing
# an explicit image name.
: "${DST_IMG:=mlperf_sd:24.01-py3}"

# Loop on the argument count rather than on "$1" being non-empty, so that an
# empty-string argument does not end option parsing early.
while [ "$#" -gt 0 ]; do
    case "$1" in
        -d | --dst-img )
            if [ "$#" -lt 2 ]; then
                echo "error: $1 requires an image name argument" >&2
                exit 1
            fi
            shift
            DST_IMG=$1
            ;;
    esac
    shift
done

# PYTHONPYCACHEPREFIX points bytecode caches at /tmp/.pycache inside the
# container. The PWD mount is quoted so that a checkout path containing
# spaces still forms a single -v argument.
docker run --rm -it --gpus=all --ipc=host \
    -e PYTHONPYCACHEPREFIX=/tmp/.pycache \
    --workdir /pwd \
    -v "${PWD}:/pwd" \
    -v /datasets/laion-400m:/datasets/laion-400m \
    -v /datasets/coco2014:/datasets/coco2014 \
    -v /checkpoints:/checkpoints \
    -v /results:/results \
    "${DST_IMG}" bash