├── blog ├── llm-finetuning │ ├── .detignore │ ├── .gitignore │ ├── startup-hook.sh │ ├── requirements.txt │ ├── distributed.yaml │ ├── chat_format.py │ └── README.md ├── llm-finetuning-2 │ ├── .detignore │ ├── .gitignore │ ├── startup-hook.sh │ ├── requirements.txt │ ├── deepspeed.yaml │ ├── lora.yaml │ └── ds_configs │ │ ├── ds_config_stage_1.json │ │ ├── ds_config_stage_2.json │ │ ├── ds_config_stage_3.json │ │ └── ds_config_stage_2_cpu_offload.json ├── llm-finetuning-3 │ ├── .detignore │ ├── .gitignore │ ├── startup-hook.sh │ ├── requirements.txt │ ├── chat_format.py │ ├── ds_configs │ │ ├── ds_config_stage_2.json │ │ ├── ds_config_stage_3.json │ │ ├── ds_config_stage_1.json │ │ └── ds_config_stage_2_cpu_offload.json │ └── dpo.yaml ├── lora-parameters │ ├── .detignore │ ├── .gitignore │ ├── startup-hook.sh │ ├── requirements.txt │ ├── README.md │ ├── ds_configs │ │ └── ds_config_stage_3.json │ └── lora.yaml ├── python_sdk_demo │ ├── mednist_model │ │ ├── requirements.txt │ │ ├── startup-hook.sh │ │ ├── config.yaml │ │ └── net.py │ ├── requirements.txt │ └── README.md ├── tp │ ├── matmul.png │ ├── mlp_tp.png │ ├── README.md │ ├── matmul_profiling.yaml │ ├── tp_profiling.yaml │ └── test_dot_product_local.py ├── README.md └── act-mem-2 │ ├── requirements.txt │ ├── README.md │ ├── attn_script.py │ ├── mlp_script.py │ └── block_script.py ├── computer_vision ├── detectron2_coco_pytorch │ ├── .detignore │ ├── metrics_by_time.png │ ├── Makefile │ ├── const_fake.yaml │ ├── mask_rcnn_R_50_FPN_noaug_1x.yaml │ ├── const.yaml │ ├── distributed.yaml │ ├── Dockerfile │ └── Base-RCNN-FPN.yaml ├── iris_tf_keras │ ├── startup-hook.sh │ ├── const.yaml │ ├── distributed.yaml │ ├── adaptive.yaml │ └── README.md ├── detr_coco_pytorch │ ├── imgs │ │ ├── val_curves.png │ │ ├── train_curves.png │ │ └── detr_architecture.png │ ├── startup-hook.sh │ ├── const_fake.yaml │ └── data_utils.py ├── efficientdet_pytorch │ ├── loss_by_gpus.png │ ├── Samples_per_sec.png │ ├── startup-hook.sh │ 
└── efficientdet_files │ │ └── utils.py ├── unets_tf_keras │ ├── Cumulative_Batches.png │ ├── Validation_Accuracy.png │ ├── startup-hook.sh │ ├── const.yaml │ └── distributed.yaml ├── cifar10_pytorch_inference │ ├── startup-hook.sh │ └── const.yaml ├── deformabledetr_coco_pytorch │ ├── imgs │ │ ├── val_curves.png │ │ └── train_curves.png │ ├── startup-hook.sh │ └── data_utils.py ├── byol_pytorch │ ├── startup-hook.sh │ ├── backbone.py │ ├── utils.py │ ├── evaluate_result.py │ └── generate_blob_list.py ├── fasterrcnn_coco_pytorch │ ├── const.yaml │ ├── adaptive.yaml │ └── README.md ├── cifar10_pytorch │ ├── const.yaml │ ├── distributed.yaml │ ├── adaptive.yaml │ ├── distributed_inference.yaml │ └── README.md └── cifar10_tf_keras │ ├── const.yaml │ ├── distributed.yaml │ ├── adaptive.yaml │ └── README.md ├── gan ├── pix2pix_tf_keras │ ├── .gitignore │ ├── .detignore │ ├── images │ │ ├── batches_vs_time.jpg │ │ ├── generated_example.jpeg │ │ ├── training_loss_vs_time.jpg │ │ └── validation_loss_vs_time.jpg │ ├── print_models.py │ ├── const.yaml │ ├── distributed.yaml │ ├── adaptive.yaml │ └── pix2pix │ │ └── sampling.py ├── dcgan_tf_keras │ ├── images │ │ └── dcgan_inference_example.png │ ├── const.yaml │ ├── distributed.yaml │ ├── data.py │ ├── README.md │ └── export.py ├── gan_mnist_pytorch │ ├── const.yaml │ ├── distributed.yaml │ ├── README.md │ └── data.py └── cyclegan │ ├── 1-gpu.yaml │ ├── 8-gpus.yaml │ ├── 64-gpus.yaml │ ├── startup-hook.sh │ ├── datasets.py │ └── utils.py ├── model_hub ├── mmdetection │ ├── hydra │ │ ├── configs │ │ │ ├── profiling │ │ │ │ ├── disabled.yaml │ │ │ │ └── enabled.yaml │ │ │ ├── data │ │ │ │ ├── disk.yaml │ │ │ │ ├── fake.yaml │ │ │ │ ├── gcs.yaml │ │ │ │ └── s3.yaml │ │ │ ├── hyperparameters │ │ │ │ ├── fp16.yaml │ │ │ │ ├── ann_file.yaml │ │ │ │ ├── base.yaml │ │ │ │ ├── tune_optimizer.yaml │ │ │ │ └── grad_clip.yaml │ │ │ ├── searcher │ │ │ │ ├── single.yaml │ │ │ │ └── adaptive.yaml │ │ │ └── config.yaml │ │ ├── 
mmdet_experiment.py │ │ └── README.md │ └── fasterrcnn.png └── huggingface │ ├── multiple-choice │ ├── figures │ │ └── swag.png │ └── swag_config.yaml │ ├── language-modeling │ ├── figures │ │ ├── clm.png │ │ ├── mlm.png │ │ └── plm.png │ ├── clm_config.yaml │ ├── mlm_config.yaml │ └── plm_config.yaml │ ├── question-answering │ ├── figures │ │ ├── squad.png │ │ ├── squad_v2.png │ │ ├── squad_v2_albert.png │ │ ├── squad_beam_search.png │ │ ├── squad_distributed.png │ │ └── squad_v2_beam_search.png │ ├── squad.yaml │ ├── squad_v2.yaml │ ├── squad_beam_search.yaml │ ├── squad_distributed.yaml │ ├── squad_v2_beam_search.yaml │ └── squad_v2_albert.yaml │ ├── text-classification │ ├── figures │ │ ├── glue.png │ │ └── xnli.png │ └── xnli_config.yaml │ └── token-classification │ ├── figures │ └── ner.png │ └── ner_config.yaml ├── features ├── ports_flask │ ├── startup-hook.sh │ ├── hello-client.yaml │ ├── hello-server.yaml │ ├── hello-server.py │ └── README.md ├── torch_batch_process_embeddings │ ├── startup-hook.sh │ ├── requirements.txt │ └── distributed.yaml ├── torch_batch_process_core_api_comparison │ ├── constants.py │ ├── core_api_config.yaml │ ├── torch_batch_process_config.yaml │ ├── model.py │ └── README.md ├── checkpoint_hooks_pytorch │ ├── const.yaml │ ├── layers.py │ ├── README.md │ └── data.py ├── custom_reducers_mnist_pytorch │ ├── const.yaml │ ├── layers.py │ ├── distributed.yaml │ ├── README.md │ └── data.py └── hp_constraints_mnist_pytorch │ ├── layers.py │ ├── adaptive.yaml │ ├── README.md │ └── data.py ├── hp_search_benchmarks ├── darts_cifar10_pytorch │ ├── startup-hook.sh │ ├── figures │ │ └── constrained_adaptive.png │ └── genotypes.py └── darts_penntreebank_pytorch │ ├── startup-hook.sh │ ├── const.yaml │ └── randomNAS_files │ └── genotypes.py ├── meta_learning └── protonet_omniglot_pytorch │ ├── startup-hook.sh │ ├── omniglot_20w1s.png │ ├── fetch_data.sh │ ├── 20way1shot.yaml │ └── 20way5shot.yaml ├── nas └── gaea_pytorch │ ├── eval │ └── 
top5_val.png │ └── search │ ├── optimizer.py │ ├── const.yaml │ └── data.py ├── nlp ├── bert_glue_pytorch │ ├── startup-hook.sh │ ├── const.yaml │ ├── distributed.yaml │ └── constants.py ├── word_language_model │ ├── validation_loss_table.png │ ├── const.yaml │ └── distributed.yaml └── albert_squad_pytorch │ ├── startup-hook.sh │ ├── constants.py │ ├── distributed_8gpu.yaml │ ├── distributed_64gpu.yaml │ └── const.yaml ├── graphs └── proteins_pytorch_geometric │ ├── startup-hook.sh │ ├── const.yaml │ ├── distributed.yaml │ └── adaptive.yaml ├── custom_search_method └── asha_search_method │ ├── remote_search_runner │ ├── searcher.yaml │ └── README.md │ ├── experiment_files │ ├── config.yaml │ ├── layers.py │ └── data.py │ └── local_search_runner │ └── README.md ├── tutorials └── fashion_mnist_tf_keras │ ├── const.yaml │ ├── distributed.yaml │ ├── adaptive.yaml │ └── README.md ├── deepspeed ├── deepspeed_dcgan │ ├── ds_config.json │ ├── mnist.yaml │ ├── mnist_grad_accum.yaml │ └── cifar10_zero2.yaml ├── pipeline_parallelism │ ├── ds_config.json │ ├── README.md │ ├── distributed.yaml │ └── alexnet.py ├── cifar10_cpu_offloading │ ├── ds_config_no_offload.json │ ├── zero_3_cpu_offload.yaml │ ├── zero_no_offload.yaml │ └── ds_config_offload.json └── cifar10_moe │ ├── ds_config.json │ ├── moe.yaml │ ├── zero_stages.yaml │ └── README.md ├── .github └── workflows │ └── check_markdown_links.yaml ├── fsdp └── minimal_fsdp │ └── config.yaml └── .gitignore /blog/llm-finetuning/.detignore: -------------------------------------------------------------------------------- 1 | text-to-sql* 2 | checkpoints -------------------------------------------------------------------------------- /blog/llm-finetuning-2/.detignore: -------------------------------------------------------------------------------- 1 | text-to-sql* 2 | checkpoints -------------------------------------------------------------------------------- /blog/llm-finetuning-3/.detignore: 
-------------------------------------------------------------------------------- 1 | text-to-sql* 2 | checkpoints -------------------------------------------------------------------------------- /blog/lora-parameters/.detignore: -------------------------------------------------------------------------------- 1 | text-to-sql* 2 | checkpoints -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/.detignore: -------------------------------------------------------------------------------- 1 | Dockerfile 2 | -------------------------------------------------------------------------------- /computer_vision/iris_tf_keras/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install pandas 2 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/.gitignore: -------------------------------------------------------------------------------- 1 | logs/ 2 | *.png 3 | checkpoints/ 4 | -------------------------------------------------------------------------------- /blog/python_sdk_demo/mednist_model/requirements.txt: -------------------------------------------------------------------------------- 1 | medmnist 2 | wget 3 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/profiling/disabled.yaml: -------------------------------------------------------------------------------- 1 | enabled: false 2 | -------------------------------------------------------------------------------- /features/ports_flask/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install flask 4 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/.detignore: -------------------------------------------------------------------------------- 1 | checkpoints/ 
2 | images/ 3 | *.png 4 | *.jpeg 5 | -------------------------------------------------------------------------------- /hp_search_benchmarks/darts_cifar10_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install attrdict 2 | -------------------------------------------------------------------------------- /blog/llm-finetuning/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_STORE 3 | text-to-sql* 4 | checkpoints -------------------------------------------------------------------------------- /blog/python_sdk_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | determined>=0.26.4 2 | medmnist 3 | PyYAML 4 | -------------------------------------------------------------------------------- /hp_search_benchmarks/darts_penntreebank_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install wget 2 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/data/disk.yaml: -------------------------------------------------------------------------------- 1 | file_client_args: 2 | backend: disk 3 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/data/fake.yaml: -------------------------------------------------------------------------------- 1 | file_client_args: 2 | backend: fake 3 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_STORE 3 | text-to-sql* 4 | checkpoints 5 | *.png -------------------------------------------------------------------------------- /blog/llm-finetuning-3/.gitignore: 
-------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_STORE 3 | text-to-sql* 4 | checkpoints 5 | *.png -------------------------------------------------------------------------------- /blog/lora-parameters/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .DS_STORE 3 | text-to-sql* 4 | checkpoints 5 | *.png -------------------------------------------------------------------------------- /blog/tp/matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/blog/tp/matmul.png -------------------------------------------------------------------------------- /blog/tp/mlp_tp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/blog/tp/mlp_tp.png -------------------------------------------------------------------------------- /blog/README.md: -------------------------------------------------------------------------------- 1 | This directory hosts example code used in the [Determined AI blog](https://www.determined.ai/blog). 
-------------------------------------------------------------------------------- /blog/lora-parameters/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install --upgrade pip 3 | pip install -r requirements.txt -------------------------------------------------------------------------------- /features/torch_batch_process_embeddings/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/data/gcs.yaml: -------------------------------------------------------------------------------- 1 | file_client_args: 2 | backend: gcs 3 | bucket_name: ??? 4 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/data/s3.yaml: -------------------------------------------------------------------------------- 1 | file_client_args: 2 | backend: s3 3 | bucket_name: ??? 4 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install --upgrade pip 3 | pip install -r requirements.txt -------------------------------------------------------------------------------- /blog/llm-finetuning/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install --upgrade pip 3 | pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/hyperparameters/fp16.yaml: -------------------------------------------------------------------------------- 1 | override_mmdet_config: 2 | fp16.loss_scale: 512. 
3 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | pip install --upgrade pip 3 | pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /meta_learning/protonet_omniglot_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | apt-get install unzip 3 | 4 | ./fetch_data.sh 5 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/profiling/enabled.yaml: -------------------------------------------------------------------------------- 1 | enabled: true 2 | begin_on_batch: ??? 3 | end_after_batch: ??? 4 | 5 | -------------------------------------------------------------------------------- /blog/llm-finetuning/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.36.2 2 | datasets==2.16.1 3 | evaluate==0.4.1 4 | trl==0.7.9 5 | scikit-learn==1.4.0 -------------------------------------------------------------------------------- /nas/gaea_pytorch/eval/top5_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/nas/gaea_pytorch/eval/top5_val.png -------------------------------------------------------------------------------- /nlp/bert_glue_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install transformers==2.8.0 scikit-learn==0.22.2.post1 2 | pip install sentencepiece==0.1.91 3 | -------------------------------------------------------------------------------- /model_hub/mmdetection/fasterrcnn.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/mmdetection/fasterrcnn.png -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/hyperparameters/ann_file.yaml: -------------------------------------------------------------------------------- 1 | override_mmdet_config: 2 | data.train.ann_file: ??? 3 | data.val.ann_file: ??? 4 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/hyperparameters/base.yaml: -------------------------------------------------------------------------------- 1 | global_batch_size: 16 2 | config_file: ??? 3 | merge_config: null 4 | use_pretrained: false 5 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.39.1 2 | datasets==2.17.0 3 | evaluate==0.4.1 4 | trl==0.8.1 5 | scikit-learn==1.4.0 6 | deepspeed==0.10.2 -------------------------------------------------------------------------------- /features/torch_batch_process_embeddings/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate>=0.12.0 2 | transformers>=4.28.1,<4.29.0 3 | tokenizers>=0.13.3 4 | datasets 5 | chromadb 6 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/images/batches_vs_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/batches_vs_time.jpg -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/searcher/single.yaml: 
-------------------------------------------------------------------------------- 1 | name: single 2 | metric: bbox_mAP 3 | max_length: 4 | batches: 87850 5 | smaller_is_better: false 6 | -------------------------------------------------------------------------------- /nlp/word_language_model/validation_loss_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/nlp/word_language_model/validation_loss_table.png -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/images/generated_example.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/generated_example.jpeg -------------------------------------------------------------------------------- /blog/llm-finetuning-2/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.37.2 2 | datasets==2.17.0 3 | evaluate==0.4.1 4 | trl==0.7.10 5 | scikit-learn==1.4.0 6 | deepspeed==0.10.2 7 | peft==0.8.2 -------------------------------------------------------------------------------- /computer_vision/detr_coco_pytorch/imgs/val_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/val_curves.png -------------------------------------------------------------------------------- /computer_vision/efficientdet_pytorch/loss_by_gpus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/efficientdet_pytorch/loss_by_gpus.png -------------------------------------------------------------------------------- 
/computer_vision/unets_tf_keras/Cumulative_Batches.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/unets_tf_keras/Cumulative_Batches.png -------------------------------------------------------------------------------- /computer_vision/unets_tf_keras/Validation_Accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/unets_tf_keras/Validation_Accuracy.png -------------------------------------------------------------------------------- /gan/dcgan_tf_keras/images/dcgan_inference_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/dcgan_tf_keras/images/dcgan_inference_example.png -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/images/training_loss_vs_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/training_loss_vs_time.jpg -------------------------------------------------------------------------------- /model_hub/huggingface/multiple-choice/figures/swag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/multiple-choice/figures/swag.png -------------------------------------------------------------------------------- /computer_vision/cifar10_pytorch_inference/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install gdown 2 | gdown https://drive.google.com/uc?id=1JTchzEFqtjbAVWXyNa5BkYPi12_CoHlS -O state_dicts/resnet18.pt 3 | 
-------------------------------------------------------------------------------- /computer_vision/detr_coco_pytorch/imgs/train_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/train_curves.png -------------------------------------------------------------------------------- /computer_vision/efficientdet_pytorch/Samples_per_sec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/efficientdet_pytorch/Samples_per_sec.png -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/images/validation_loss_vs_time.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/validation_loss_vs_time.jpg -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/figures/clm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/clm.png -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/figures/mlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/mlm.png -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/figures/plm.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/plm.png -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/metrics_by_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detectron2_coco_pytorch/metrics_by_time.png -------------------------------------------------------------------------------- /meta_learning/protonet_omniglot_pytorch/omniglot_20w1s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/meta_learning/protonet_omniglot_pytorch/omniglot_20w1s.png -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad.png -------------------------------------------------------------------------------- /model_hub/huggingface/text-classification/figures/glue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/text-classification/figures/glue.png -------------------------------------------------------------------------------- /model_hub/huggingface/text-classification/figures/xnli.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/text-classification/figures/xnli.png -------------------------------------------------------------------------------- 
/model_hub/huggingface/token-classification/figures/ner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/token-classification/figures/ner.png -------------------------------------------------------------------------------- /computer_vision/detr_coco_pytorch/imgs/detr_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/detr_architecture.png -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2.png -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/hyperparameters/tune_optimizer.yaml: -------------------------------------------------------------------------------- 1 | override_mmdet_config: 2 | optimizer.lr: 3 | type: log 4 | base: 10 5 | minval: -3 6 | maxval: -1 7 | -------------------------------------------------------------------------------- /blog/lora-parameters/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.37.2 2 | datasets==2.17.0 3 | evaluate==0.4.1 4 | trl==0.7.10 5 | scikit-learn==1.4.0 6 | deepspeed==0.10.2 7 | peft==0.8.2 8 | huggingface_hub -------------------------------------------------------------------------------- /computer_vision/deformabledetr_coco_pytorch/imgs/val_curves.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/deformabledetr_coco_pytorch/imgs/val_curves.png -------------------------------------------------------------------------------- /computer_vision/deformabledetr_coco_pytorch/imgs/train_curves.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/deformabledetr_coco_pytorch/imgs/train_curves.png -------------------------------------------------------------------------------- /features/torch_batch_process_core_api_comparison/constants.py: -------------------------------------------------------------------------------- 1 | DATA_DIRECTORY = "/tmp/data/cifar10" 2 | LOCK_FILE = "/tmp/data/cifar10/cifar10.lock" 3 | PREDICTIONS_DIRECTORY = "/tmp/inference_out/" 4 | -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad_v2_albert.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2_albert.png -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad_beam_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_beam_search.png -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad_distributed.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_distributed.png -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/figures/squad_v2_beam_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2_beam_search.png -------------------------------------------------------------------------------- /hp_search_benchmarks/darts_cifar10_pytorch/figures/constrained_adaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/hp_search_benchmarks/darts_cifar10_pytorch/figures/constrained_adaptive.png -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/Makefile: -------------------------------------------------------------------------------- 1 | TAG := determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10 2 | 3 | .PHONY: build 4 | build: 5 | docker build -f Dockerfile -t $(TAG) . 
&& \ 6 | docker push $(TAG) 7 | -------------------------------------------------------------------------------- /graphs/proteins_pytorch_geometric/startup-hook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install torch_geometric==2.2.0 4 | pip install torch_sparse==0.6.16 torch_scatter==2.1.0 -f https://pytorch-geometric.com/whl/torch-1.12.0+cu113.html 5 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/hyperparameters/grad_clip.yaml: -------------------------------------------------------------------------------- 1 | override_mmdet_config: 2 | optimizer_config._delete_: true 3 | optimizer_config.grad_clip.max_norm: ??? 4 | optimizer_config.grad_clip.norm_type: ??? 5 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/searcher/adaptive.yaml: -------------------------------------------------------------------------------- 1 | name: adaptive_asha 2 | metric: bbox_mAP 3 | max_length: 4 | batches: 87850 5 | max_trials: 100 6 | mode: aggressive 7 | max_rungs: 4 8 | smaller_is_better: false 9 | -------------------------------------------------------------------------------- /computer_vision/unets_tf_keras/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install "setuptools<66" # necessary for installing tensorflow/examples for some reason 2 | pip install git+https://github.com/tensorflow/examples.git 3 | pip install -q -U tfds-nightly 4 | -------------------------------------------------------------------------------- /features/ports_flask/hello-client.yaml: -------------------------------------------------------------------------------- 1 | name: hello-client 2 | entrypoint: python3 hello-client.py 3 | resources: 4 | slots: 0 5 | max_restarts: 0 6 | 7 | searcher: 8 | name: single 9 | metric: x 10 | max_length: 
10000000 11 | -------------------------------------------------------------------------------- /blog/act-mem-2/requirements.txt: -------------------------------------------------------------------------------- 1 | einops==0.8.0 2 | filelock==3.14.0 3 | fsspec==2024.5.0 4 | iniconfig==2.0.0 5 | Jinja2==3.1.4 6 | MarkupSafe==2.1.5 7 | mpmath==1.3.0 8 | networkx==3.3 9 | packaging==24.0 10 | pluggy==1.5.0 11 | pytest==8.2.1 12 | sympy==1.12.1 13 | torch==2.3.0 14 | typing_extensions==4.12.0 15 | -------------------------------------------------------------------------------- /blog/python_sdk_demo/mednist_model/startup-hook.sh: -------------------------------------------------------------------------------- 1 | # This file is executed when the container that this model will run on 2 | # starts up. 3 | # 4 | # For more information, see 5 | # https://docs.determined.ai/latest/model-dev-guide/prepare-container/custom-env.html#startup-hook 6 | 7 | pip install -r requirements.txt 8 | -------------------------------------------------------------------------------- /custom_search_method/asha_search_method/remote_search_runner/searcher.yaml: -------------------------------------------------------------------------------- 1 | name: remote-search-runner 2 | entrypoint: python3 remote_search_runner/run_experiment.py 3 | searcher: 4 | metric: validation_error 5 | smaller_is_better: true 6 | name: single 7 | max_length: 8 | batches: 1000 9 | max_restarts: 0 10 | -------------------------------------------------------------------------------- /computer_vision/byol_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | # Copy LARS implementation from upstream repo. 
2 | git clone https://github.com/untitled-ai/self_supervised.git 3 | (cd self_supervised && git checkout 6d14ca0402ecc13feda9b3a9fdc056fd1ac24473) 4 | cp self_supervised/lars.py ./ 5 | python3 -m pip install attrdict byol-pytorch filelock 6 | -------------------------------------------------------------------------------- /nlp/albert_squad_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | # Very important to pin sentencepiece as the newer version causes segmentation faults (as of Oct 2020) 2 | pip install sentencepiece==0.1.91 3 | pip install transformers==3.1.0 4 | pip install -e git+git://github.com/LiyuanLucasLiu/RAdam.git@baf4f65445c00d686d4098841b3ca1f62a886326#egg=radam 5 | -------------------------------------------------------------------------------- /tutorials/fashion_mnist_tf_keras/const.yaml: -------------------------------------------------------------------------------- 1 | name: fashion_mnist_tf_keras_const 2 | hyperparameters: 3 | global_batch_size: 32 4 | dense1: 128 5 | records_per_epoch: 60000 6 | searcher: 7 | name: single 8 | metric: val_accuracy 9 | smaller_is_better: false 10 | max_length: 11 | epochs: 5 12 | entrypoint: model_def:FashionMNISTTrial 13 | -------------------------------------------------------------------------------- /computer_vision/cifar10_pytorch_inference/const.yaml: -------------------------------------------------------------------------------- 1 | description: cifar10_pytorch_inference_const 2 | hyperparameters: 3 | global_batch_size: 8 4 | records_per_epoch: 50000 5 | searcher: 6 | name: single 7 | metric: validation_error 8 | max_length: 9 | epochs: 1 10 | entrypoint: model_def:CIFARTrial 11 | min_validation_period: 12 | epochs: 1 13 | -------------------------------------------------------------------------------- /custom_search_method/asha_search_method/experiment_files/config.yaml: 
-------------------------------------------------------------------------------- 1 | name: mnist-custom-search-experiment 2 | data: 3 | url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz 4 | searcher: 5 | name: custom 6 | metric: validation_loss 7 | smaller_is_better: true 8 | unit: batches 9 | entrypoint: model_def:MNistTrial 10 | -------------------------------------------------------------------------------- /features/ports_flask/hello-server.yaml: -------------------------------------------------------------------------------- 1 | name: hello-server 2 | entrypoint: python3 hello-server.py 3 | resources: 4 | slots: 0 5 | max_restarts: 0 6 | environment: 7 | proxy_ports: 8 | - proxy_port: 5000 9 | proxy_tcp: true 10 | unauthenticated: true 11 | 12 | searcher: 13 | name: single 14 | metric: x 15 | max_length: 10000000 16 | -------------------------------------------------------------------------------- /gan/dcgan_tf_keras/const.yaml: -------------------------------------------------------------------------------- 1 | name: dc_gan 2 | hyperparameters: 3 | noise_dim: 128 4 | global_batch_size: 256 5 | discriminator_lr: 0.0001 6 | generator_lr: 0.0001 7 | records_per_epoch: 50000 8 | searcher: 9 | name: single 10 | metric: "val_d_loss" 11 | smaller_is_better: true 12 | max_length: 13 | epochs: 50 14 | entrypoint: model_def:DCGanTrial 15 | -------------------------------------------------------------------------------- /deepspeed/deepspeed_dcgan/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 64, 3 | "optimizer": { 4 | "type": "Adam", 5 | "params": { 6 | "lr": 0.0002, 7 | "betas": [ 8 | 0.5, 9 | 0.999 10 | ], 11 | "eps": 1e-8 12 | } 13 | }, 14 | "steps_per_print": 10 15 | } 16 | -------------------------------------------------------------------------------- /.github/workflows/check_markdown_links.yaml: 
-------------------------------------------------------------------------------- 1 | name: Check Markdown links 2 | on: 3 | pull_request: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | markdown-link-check: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@master 12 | - uses: gaurav-nelson/github-action-markdown-link-check@v1 13 | with: 14 | use-quiet-mode: 'yes' 15 | -------------------------------------------------------------------------------- /features/ports_flask/hello-server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, jsonify, request 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route("/hello", methods=["GET"]) 7 | def hello(): 8 | if request.method == "GET": 9 | data = {"data": "Hello World"} 10 | return jsonify(data) 11 | 12 | 13 | if __name__ == "__main__": 14 | app.run(host="0.0.0.0", port=5000, debug=True) 15 | -------------------------------------------------------------------------------- /hp_search_benchmarks/darts_cifar10_pytorch/genotypes.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | Genotype = namedtuple("Genotype", "normal normal_concat reduce reduce_concat") 4 | 5 | PRIMITIVES = [ 6 | "none", 7 | "max_pool_3x3", 8 | "avg_pool_3x3", 9 | "skip_connect", 10 | "sep_conv_3x3", 11 | "sep_conv_5x5", 12 | "dil_conv_3x3", 13 | "dil_conv_5x5", 14 | ] 15 | -------------------------------------------------------------------------------- /tutorials/fashion_mnist_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: fashion_mnist_tf_keras_distributed 2 | hyperparameters: 3 | global_batch_size: 256 4 | dense1: 128 5 | resources: 6 | slots_per_trial: 8 7 | records_per_epoch: 60000 8 | searcher: 9 | name: single 10 | metric: val_accuracy 11 | smaller_is_better: false 12 | max_length: 13 | epochs: 5 14 | entrypoint: 
model_def:FashionMNISTTrial 15 | -------------------------------------------------------------------------------- /computer_vision/efficientdet_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | pip install timm==0.3.1 2 | # pycocotools 2.0.5, a dependency of efficientdet-pytorch, 3 | # would not install without cython 4 | pip install pycocotools==2.0.4 5 | git clone https://github.com/rwightman/efficientdet-pytorch.git 6 | cd efficientdet-pytorch 7 | git checkout 611532db49fdd691f48f913bc433391a12014bd8 8 | python setup.py install 9 | cd .. 10 | -------------------------------------------------------------------------------- /deepspeed/pipeline_parallelism/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size" : 256, 3 | "train_micro_batch_size_per_gpu" : 8, 4 | 5 | "optimizer": { 6 | "type": "Adam", 7 | "params": { 8 | "lr": 0.001, 9 | "betas": [ 10 | 0.9, 11 | 0.999 12 | ], 13 | "eps": 1e-8 14 | } 15 | }, 16 | 17 | "steps_per_print" : 100, 18 | "wall_clock_breakdown" : false 19 | } 20 | -------------------------------------------------------------------------------- /gan/dcgan_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: dc_gan 2 | hyperparameters: 3 | noise_dim: 128 4 | global_batch_size: 1024 5 | discriminator_lr: 0.00003 6 | generator_lr: 0.00003 7 | records_per_epoch: 50000 8 | searcher: 9 | name: single 10 | metric: "val_d_loss" 11 | smaller_is_better: true 12 | max_length: 13 | epochs: 50 14 | entrypoint: model_def:DCGanTrial 15 | resources: 16 | slots_per_trial: 4 17 | -------------------------------------------------------------------------------- /features/torch_batch_process_embeddings/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: bert_embedding_generation 2 | entrypoint: >- 3 | python3 -m 
determined.launch.torch_distributed 4 | python3 bert_embedding_generation.py 5 | 6 | resources: 7 | slots_per_trial: 2 8 | 9 | searcher: 10 | name: single 11 | metric: x 12 | max_length: 100 13 | 14 | max_restarts: 0 15 | bind_mounts: 16 | - host_path: /tmp 17 | container_path: /tmp 18 | -------------------------------------------------------------------------------- /gan/gan_mnist_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: gan_mnist_pytorch_const 2 | data: 3 | url: "https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz" 4 | hyperparameters: 5 | global_batch_size: 32 6 | lr: 0.0002 7 | b1: 0.5 8 | b2: 0.999 9 | latent_dim: 100 10 | searcher: 11 | name: single 12 | metric: loss 13 | max_length: 14 | batches: 40000 15 | smaller_is_better: True 16 | entrypoint: model_def:GANTrial 17 | -------------------------------------------------------------------------------- /tutorials/fashion_mnist_tf_keras/adaptive.yaml: -------------------------------------------------------------------------------- 1 | name: fashion_mnist_tf_keras_adaptive_search 2 | hyperparameters: 3 | global_batch_size: 32 4 | dense1: 5 | type: int 6 | minval: 32 7 | maxval: 256 8 | records_per_epoch: 60000 9 | searcher: 10 | name: adaptive_asha 11 | metric: val_accuracy 12 | smaller_is_better: false 13 | max_length: 14 | epochs: 5 15 | max_trials: 10 16 | entrypoint: model_def:FashionMNISTTrial 17 | -------------------------------------------------------------------------------- /features/torch_batch_process_core_api_comparison/core_api_config.yaml: -------------------------------------------------------------------------------- 1 | name: core_api_batch_inference 2 | entrypoint: >- 3 | python3 -m determined.launch.torch_distributed 4 | python3 core_api_inference.py 5 | 6 | resources: 7 | slots_per_trial: 2 8 | 9 | searcher: 10 | name: single 11 | metric: x 12 | max_length: 100 13 | max_restarts: 2 14 | 
bind_mounts: 15 | - host_path: /tmp 16 | container_path: /tmp 17 | read_only: false 18 | -------------------------------------------------------------------------------- /nlp/albert_squad_pytorch/constants.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AlbertConfig, 3 | AlbertForQuestionAnswering, 4 | AlbertTokenizer, 5 | BertConfig, 6 | BertForQuestionAnswering, 7 | BertTokenizer, 8 | ) 9 | 10 | MODEL_CLASSES = { 11 | "bert": (BertConfig, BertTokenizer, BertForQuestionAnswering), 12 | "albert": ( 13 | AlbertConfig, 14 | AlbertTokenizer, 15 | AlbertForQuestionAnswering, 16 | ), 17 | } 18 | -------------------------------------------------------------------------------- /blog/tp/README.md: -------------------------------------------------------------------------------- 1 | # Tensor Parallelism 2 | 3 | Code accompanying the deep-dive [blog post on Tensor Parallelism](https://determined.ai/blog/tp). 4 | 5 | - The MLP and TP MLP layers are in `layer.py` 6 | - Matmul profiling code is in `matmul_profiling.py` 7 | - MLP TP profiling code is in `tp_profiling.py` 8 | - Tests of rearranging tensor sums are in `test_dot_product_{local,distributed}.py` 9 | 10 | 11 | ## Contributors 12 | 13 | - [Garrett Goon](https://github.com/garrett361) -------------------------------------------------------------------------------- /features/torch_batch_process_core_api_comparison/torch_batch_process_config.yaml: -------------------------------------------------------------------------------- 1 | name: torch_batch_process_batch_inference 2 | entrypoint: >- 3 | python3 -m determined.launch.torch_distributed 4 | python3 torch_batch_process_inference.py 5 | 6 | resources: 7 | slots_per_trial: 2 8 | 9 | searcher: 10 | name: single 11 | metric: x 12 | max_length: 100 13 | 14 | max_restarts: 2 15 | bind_mounts: 16 | - host_path: /tmp 17 | container_path: /tmp 18 | read_only: false 19 | 
-------------------------------------------------------------------------------- /computer_vision/fasterrcnn_coco_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: fasterrcnn_coco_pytorch_const 2 | data: 3 | url: https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/PennFudanPed/PennFudanPed.zip 4 | hyperparameters: 5 | learning_rate: 0.005 6 | momentum: 0.9 7 | weight_decay: 0.0005 8 | global_batch_size: 2 9 | searcher: 10 | name: single 11 | metric: val_avg_iou 12 | smaller_is_better: false 13 | max_length: 14 | batches: 800 15 | entrypoint: model_def:ObjectDetectionTrial 16 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/print_models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from pix2pix import make_discriminator_model, make_generator_model 3 | 4 | 5 | def main(): 6 | generator = make_generator_model() 7 | tf.keras.utils.plot_model(generator, show_shapes=True, dpi=64, to_file="generator.png") 8 | discriminator = make_discriminator_model() 9 | tf.keras.utils.plot_model(discriminator, show_shapes=True, dpi=64, to_file="discriminator.png") 10 | 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /gan/gan_mnist_pytorch/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: gan_mnist_pytorch_distributed 2 | data: 3 | url: "https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz" 4 | hyperparameters: 5 | global_batch_size: 256 # per GPU batch size of 32 6 | lr: 0.0002 7 | b1: 0.5 8 | b2: 0.999 9 | latent_dim: 100 10 | searcher: 11 | name: single 12 | metric: loss 13 | max_length: 14 | batches: 5000 15 | smaller_is_better: True 16 | entrypoint: model_def:GANTrial 17 | resources: 18 | slots_per_trial: 8 19 | 
-------------------------------------------------------------------------------- /computer_vision/iris_tf_keras/const.yaml: -------------------------------------------------------------------------------- 1 | name: iris_tf_keras_const 2 | data: 3 | train_url: http://download.tensorflow.org/data/iris_training.csv 4 | test_url: http://download.tensorflow.org/data/iris_test.csv 5 | hyperparameters: 6 | learning_rate: 1.0e-4 7 | learning_rate_decay: 1.0e-6 8 | layer1_dense_size: 16 9 | global_batch_size: 30 10 | searcher: 11 | name: single 12 | metric: val_categorical_accuracy 13 | smaller_is_better: false 14 | max_length: 15 | batches: 5000 16 | entrypoint: model_def:IrisTrial 17 | -------------------------------------------------------------------------------- /features/checkpoint_hooks_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: mnist_pytorch_const 2 | data: 3 | url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz 4 | hyperparameters: 5 | learning_rate: 1.0 6 | global_batch_size: 64 7 | n_filters1: 32 8 | n_filters2: 64 9 | dropout1: 0.25 10 | dropout2: 0.5 11 | searcher: 12 | name: single 13 | metric: validation_loss 14 | max_length: 15 | batches: 937 #60,000 training images with batch size 64 16 | smaller_is_better: true 17 | entrypoint: model_def:MNistTrial 18 | -------------------------------------------------------------------------------- /computer_vision/cifar10_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_pytorch_const 2 | description: An example experiment using Determined AI with CIFAR10 and PyTorch. 
3 | hyperparameters: 4 | learning_rate: 1.0e-4 5 | learning_rate_decay: 1.0e-6 6 | layer1_dropout: 0.25 7 | layer2_dropout: 0.25 8 | layer3_dropout: 0.5 9 | global_batch_size: 32 10 | records_per_epoch: 50000 11 | searcher: 12 | name: single 13 | metric: validation_error 14 | max_length: 15 | epochs: 32 16 | entrypoint: model_def:CIFARTrial 17 | min_validation_period: 18 | epochs: 1 -------------------------------------------------------------------------------- /blog/tp/matmul_profiling.yaml: -------------------------------------------------------------------------------- 1 | name: Matmul Profiling 2 | # Adjust the workspace and project names, as appropriate. 3 | workspace: TP Blog Post 4 | project: Matmul Profiling 5 | resources: 6 | slots_per_trial: 1 7 | searcher: 8 | name: single 9 | metric: not_used 10 | max_length: 1 11 | hyperparameters: 12 | d_model_min: 256 13 | d_model_max: 16384 14 | d_model_step: 256 15 | num_warmups: 5 16 | num_repeats: 100 17 | entrypoint: >- 18 | python3 -m determined.launch.torch_distributed 19 | python3 matmul_profiling.py 20 | max_restarts: 0 21 | -------------------------------------------------------------------------------- /computer_vision/cifar10_tf_keras/const.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_tf_keras_const 2 | hyperparameters: 3 | learning_rate: 1.0e-4 4 | learning_rate_decay: 1.0e-6 5 | layer1_dropout: 0.25 6 | layer2_dropout: 0.25 7 | layer3_dropout: 0.5 8 | global_batch_size: 40 9 | width_factor: 0.1 10 | height_factor: 0.1 11 | horizontal_flip: True 12 | records_per_epoch: 50000 13 | searcher: 14 | name: single 15 | metric: val_categorical_error 16 | max_length: 17 | epochs: 32 18 | min_validation_period: 19 | epochs: 1 20 | entrypoint: model_def:CIFARTrial 21 | -------------------------------------------------------------------------------- /blog/tp/tp_profiling.yaml: -------------------------------------------------------------------------------- 
1 | name: MLP TP Profiling 2 | # Adjust the workspace and project names, as appropriate. 3 | workspace: TP Blog Post 4 | project: MLP TP Profiling 5 | resources: 6 | slots_per_trial: 8 7 | searcher: 8 | name: single 9 | metric: not_used 10 | max_length: 1 11 | hyperparameters: 12 | batch_size: 1 13 | seq_len: 4096 14 | d_model_min: 1024 15 | d_model_max: 20480 16 | d_model_step: 512 17 | num_warmups: 5 18 | num_repeats: 100 19 | entrypoint: >- 20 | python3 -m determined.launch.torch_distributed 21 | python3 tp_profiling.py 22 | max_restarts: 0 23 | -------------------------------------------------------------------------------- /graphs/proteins_pytorch_geometric/const.yaml: -------------------------------------------------------------------------------- 1 | name: proteins_pytorch_geometric 2 | hyperparameters: 3 | global_batch_size: 60 4 | dataset: PROTEINS 5 | lr: 0.0005 6 | topk_pooling_ratio: 0.8 7 | dropout: 0.5 8 | training_records: 890 9 | records_per_epoch: 890 10 | min_validation_period: 11 | epochs: 1 12 | searcher: 13 | name: single 14 | metric: validation_loss 15 | max_length: 16 | epochs: 200 17 | smaller_is_better: true 18 | entrypoint: model_def:GraphConvTrial 19 | environment: 20 | image: 21 | cuda: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1 22 | -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/const_fake.yaml: -------------------------------------------------------------------------------- 1 | name: detectron2_const_e2e_tests 2 | environment: 3 | image: "determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10" 4 | environment_variables: 5 | - DETECTRON2_DATASETS=. 
6 | hyperparameters: 7 | global_batch_size: 1 8 | model_yaml: mask_rcnn_R_50_FPN_noaug_1x.yaml 9 | output_dir: None 10 | fake_data: True 11 | searcher: 12 | name: single 13 | metric: bboxAP 14 | max_length: 15 | batches: 100 16 | smaller_is_better: false 17 | resources: 18 | slots_per_trial: 1 19 | entrypoint: model_def:DetectronTrial 20 | max_restarts: 0 21 | -------------------------------------------------------------------------------- /computer_vision/iris_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: iris_tf_keras_distributed 2 | data: 3 | train_url: http://download.tensorflow.org/data/iris_training.csv 4 | test_url: http://download.tensorflow.org/data/iris_test.csv 5 | hyperparameters: 6 | learning_rate: 1.0e-4 7 | learning_rate_decay: 1.0e-6 8 | layer1_dense_size: 16 9 | global_batch_size: 30 10 | resources: 11 | slots_per_trial: 2 # Use 2 GPUs to train the model. 12 | searcher: 13 | name: single 14 | metric: val_categorical_accuracy 15 | smaller_is_better: false 16 | max_length: 17 | batches: 2500 18 | entrypoint: model_def:IrisTrial 19 | -------------------------------------------------------------------------------- /features/custom_reducers_mnist_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: custom_reducers_mnist_pytorch_const 2 | data: 3 | url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz 4 | hyperparameters: 5 | learning_rate: 1.0 6 | global_batch_size: 64 7 | n_filters1: 32 8 | n_filters2: 64 9 | dropout1: 0.25 10 | dropout2: 0.5 11 | searcher: 12 | name: single 13 | metric: validation_loss 14 | max_length: 15 | batches: 937 #60,000 training images with batch size 64 16 | smaller_is_better: true 17 | entrypoint: model_def:MNistTrial 18 | 19 | # Show off validation metrics. 
20 | min_validation_period: 21 | batches: 100 22 | -------------------------------------------------------------------------------- /nlp/bert_glue_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: bert_glue_pytorch_const 2 | hyperparameters: 3 | global_batch_size: 24 4 | learning_rate: 2.0e-5 5 | lr_scheduler_epoch_freq: 1 6 | model_type: 'bert' 7 | adam_epsilon: 1.0e-8 8 | weight_decay: 0 9 | num_warmup_steps: 0 10 | num_training_steps: 459 11 | max_seq_length: 128 12 | searcher: 13 | name: single 14 | metric: acc 15 | max_length: 16 | batches: 400 17 | smaller_is_better: false 18 | data: 19 | task: 'MRPC' 20 | model_name_or_path: "bert-base-uncased" 21 | output_mode: "classification" 22 | path_to_mrpc: '' 23 | download_data: True 24 | entrypoint: model_def:BertPyTorch 25 | -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/mask_rcnn_R_50_FPN_noaug_1x.yaml: -------------------------------------------------------------------------------- 1 | _BASE_: "Base-RCNN-FPN.yaml" 2 | MODEL: 3 | WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" 4 | MASK_ON: True 5 | RESNETS: 6 | DEPTH: 50 7 | # Detectron1 uses smooth L1 loss with some magic beta values. 8 | # The defaults are changed to L1 loss in Detectron2. 
9 | RPN: 10 | SMOOTH_L1_BETA: 0.1111 11 | ROI_BOX_HEAD: 12 | SMOOTH_L1_BETA: 1.0 13 | POOLER_SAMPLING_RATIO: 2 14 | POOLER_TYPE: "ROIAlign" 15 | ROI_MASK_HEAD: 16 | POOLER_SAMPLING_RATIO: 2 17 | POOLER_TYPE: "ROIAlign" 18 | INPUT: 19 | # no scale augmentation 20 | MIN_SIZE_TRAIN: (800, ) -------------------------------------------------------------------------------- /computer_vision/fasterrcnn_coco_pytorch/adaptive.yaml: -------------------------------------------------------------------------------- 1 | name: fasterrcnn_coco_pytorch_adaptive_search 2 | data: 3 | url: https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/PennFudanPed/PennFudanPed.zip 4 | hyperparameters: 5 | learning_rate: 6 | type: double 7 | minval: 0.0001 8 | maxval: 0.001 9 | momentum: 10 | type: double 11 | minval: 0.2 12 | maxval: 1.0 13 | weight_decay: 0.0005 14 | global_batch_size: 2 15 | searcher: 16 | name: adaptive_asha 17 | metric: val_avg_iou 18 | smaller_is_better: false 19 | max_length: 20 | batches: 800 21 | max_trials: 16 22 | entrypoint: model_def:ObjectDetectionTrial 23 | -------------------------------------------------------------------------------- /graphs/proteins_pytorch_geometric/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: proteins_pytorch_geometric_distributed 2 | hyperparameters: 3 | global_batch_size: 60 4 | dataset: PROTEINS 5 | lr: 0.0005 6 | topk_pooling_ratio: 0.8 7 | dropout: 0.5 8 | training_records: 890 9 | records_per_epoch: 890 10 | min_validation_period: 11 | epochs: 1 12 | searcher: 13 | name: single 14 | metric: validation_loss 15 | max_length: 16 | epochs: 200 17 | smaller_is_better: true 18 | entrypoint: model_def:GraphConvTrial 19 | environment: 20 | image: 21 | cuda: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1 22 | resources: 23 | slots_per_trial: 4 24 | -------------------------------------------------------------------------------- 
/features/ports_flask/README.md: -------------------------------------------------------------------------------- 1 | # Determined experiment spinning off a flask server 2 | 3 | This example includes two experiments: 4 | 5 | 1. `hello-server`, a flask-based "hello world" web app. 6 | 2. `hello-client`, which launches `hello-server`, waits for the server to stand up, makes a request to it, then kills it and shuts down. 7 | 8 | To launch this example: 9 | 10 | det e create hello-client.yaml . -f 11 | 12 | Upon successful completion, you should see the following in the experiment logs: 13 | 14 | Got server response: {'data': 'Hello World'} 15 | SUCCESS! 16 | Killed experiment 17 | hello-server is killed. 18 | -------------------------------------------------------------------------------- /computer_vision/cifar10_pytorch/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_pytorch_distributed 2 | description: An example experiment using Determined AI with CIFAR10, PyTorch and distributed multi-GPU training. 
3 | hyperparameters: 4 | learning_rate: 1.0e-4 5 | learning_rate_decay: 1.0e-6 6 | layer1_dropout: 0.25 7 | layer2_dropout: 0.25 8 | layer3_dropout: 0.5 9 | global_batch_size: 512 # Per-GPU batch size of 32 10 | resources: 11 | slots_per_trial: 16 12 | records_per_epoch: 50000 13 | searcher: 14 | name: single 15 | metric: validation_error 16 | max_length: 17 | epochs: 32 18 | entrypoint: model_def:CIFARTrial 19 | min_validation_period: 20 | epochs: 1 21 | -------------------------------------------------------------------------------- /computer_vision/cifar10_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_tf_keras_distributed 2 | hyperparameters: 3 | learning_rate: 1.0e-4 4 | learning_rate_decay: 1.0e-6 5 | layer1_dropout: 0.25 6 | layer2_dropout: 0.25 7 | layer3_dropout: 0.5 8 | global_batch_size: 512 # Per-GPU batch size of 32 9 | width_factor: 0.1 10 | height_factor: 0.1 11 | horizontal_flip: True 12 | records_per_epoch: 50000 13 | resources: 14 | slots_per_trial: 16 # Use 16 GPUs to train the model. 15 | searcher: 16 | name: single 17 | metric: val_categorical_error 18 | max_length: 19 | epochs: 32 20 | min_validation_period: 21 | epochs: 1 22 | entrypoint: model_def:CIFARTrial 23 | -------------------------------------------------------------------------------- /blog/act-mem-2/README.md: -------------------------------------------------------------------------------- 1 | # Activation Memory: Part 2 2 | 3 | Code accompanying the deep-dive [blog post on activation memory](https://determined.ai/blog/act-mem-2). 4 | 5 | - The main utility code is in `act_mem.py`. 6 | - Basic transformer layers are implemented in `layers.py`. 7 | - The scripts `{block,mlp}_script.py` demonstrate how replacing `GELU` with `ReLU` affects activation 8 | memory. 9 | - `attn_script.py` shows the cost of activation memory in the attention layer. 10 | - Tests of the code are in `test.py`. 
11 | - See `requirements.txt` for versions the code was built against. 12 | 13 | 14 | ## Contributors 15 | 16 | - [Garrett Goon](https://github.com/garrett361) -------------------------------------------------------------------------------- /features/checkpoint_hooks_pytorch/layers.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import torch 4 | from torch import nn 5 | 6 | from determined.pytorch import TorchData 7 | 8 | 9 | class Flatten(nn.Module): 10 | def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor: 11 | assert len(args) == 1 12 | x = args[0] 13 | assert isinstance(x, torch.Tensor) 14 | return x.contiguous().view(x.size(0), -1) 15 | 16 | 17 | class Squeeze(nn.Module): 18 | def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor: 19 | assert len(args) == 1 20 | x = args[0] 21 | assert isinstance(x, torch.Tensor) 22 | return torch.squeeze(x) 23 | -------------------------------------------------------------------------------- /fsdp/minimal_fsdp/config.yaml: -------------------------------------------------------------------------------- 1 | name: fsdp example 2 | entrypoint: python3 -m determined.launch.torch_distributed -- python3 fsdp.py 3 | searcher: 4 | name: single 5 | metric: loss 6 | max_length: 100 7 | resources: 8 | slots_per_trial: 2 9 | environment: 10 | image: 11 | gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-mpi-0.31.1 12 | hyperparameters: 13 | batch_size: 1 14 | lr: 1e-4 15 | d_model: 512 16 | max_seq_len: 2048 17 | n_heads: 8 18 | n_layers: 4 19 | vocab_size: 32000 20 | report_rate: 10 21 | checkpoint_rate: 50 22 | amp_dtype: float16 23 | validation_batches: 10 24 | core_api_profiler: false 25 | torch_profiler: false 26 | max_restarts: 0 27 | -------------------------------------------------------------------------------- /features/custom_reducers_mnist_pytorch/layers.py: 
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single dimension."""

    def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
        # Exactly one positional input is expected; keyword arguments are ignored.
        assert len(args) == 1
        tensor = args[0]
        assert isinstance(tensor, torch.Tensor)
        leading = tensor.size(0)
        # view() needs contiguous memory, hence the contiguous() call first.
        return tensor.contiguous().view(leading, -1)


class Squeeze(nn.Module):
    """Remove every size-1 dimension from the input tensor."""

    def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
        # Exactly one positional input is expected; keyword arguments are ignored.
        assert len(args) == 1
        tensor = args[0]
        assert isinstance(tensor, torch.Tensor)
        # No dim is given, so a leading dimension of size 1 is removed as well.
        return tensor.squeeze()
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single dimension."""

    def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
        # Exactly one positional input is expected; keyword arguments are ignored.
        assert len(args) == 1
        tensor = args[0]
        assert isinstance(tensor, torch.Tensor)
        leading = tensor.size(0)
        # view() needs contiguous memory, hence the contiguous() call first.
        return tensor.contiguous().view(leading, -1)


class Squeeze(nn.Module):
    """Remove every size-1 dimension from the input tensor."""

    def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
        # Exactly one positional input is expected; keyword arguments are ignored.
        assert len(args) == 1
        tensor = args[0]
        assert isinstance(tensor, torch.Tensor)
        # No dim is given, so a leading dimension of size 1 is removed as well.
        return tensor.squeeze()
# Startup hook for the DETR COCO example: installs unzip, downloads the COCO
# 2017 annotations into /tmp, checks out a pinned DETR commit and installs the
# Python packages used alongside it.
apt-get update
# -y: the hook runs without a terminal, so apt must never wait for confirmation.
apt-get install -y unzip

wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
unzip -o annotations_trainval2017.zip
mv annotations/instances_train2017.json /tmp
mv annotations/instances_val2017.json /tmp

git clone https://github.com/facebookresearch/detr.git
# Pin DETR to a known commit so the sed patch below keeps matching.
cd detr && git reset --hard 4e1a9281bc5621dcd65f3438631de25e255c4269
# Need to fix a bug in the original code that fails to handle torchvision version 0.10 correctly.
# (The original check parses the first three characters of the version as a float.)
sed -i 's/float(torchvision\.__version__\[:3\]) < 0.7/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py
cd ..

pip install attrdict
pip install pycocotools
14 | 15 | pip install attrdict 16 | pip install pycocotools 17 | -------------------------------------------------------------------------------- /computer_vision/iris_tf_keras/adaptive.yaml: -------------------------------------------------------------------------------- 1 | name: iris_tf_keras_adaptive_search 2 | data: 3 | train_url: http://download.tensorflow.org/data/iris_training.csv 4 | test_url: http://download.tensorflow.org/data/iris_test.csv 5 | hyperparameters: 6 | learning_rate: 7 | type: log 8 | minval: -5.0 9 | maxval: 1.0 10 | base: 10.0 11 | learning_rate_decay: 1.0e-6 12 | layer1_dense_size: 13 | type: int 14 | minval: 4 15 | maxval: 32 16 | global_batch_size: 17 | type: int 18 | minval: 5 19 | maxval: 30 20 | searcher: 21 | name: adaptive_asha 22 | metric: val_categorical_accuracy 23 | smaller_is_better: false 24 | max_length: 25 | batches: 6400 26 | max_trials: 512 27 | entrypoint: model_def:IrisTrial 28 | -------------------------------------------------------------------------------- /gan/cyclegan/1-gpu.yaml: -------------------------------------------------------------------------------- 1 | description: Cycle GAN Pytorch 1 GPU 2 | data: 3 | downloaded_path: /tmp 4 | dataset_name: monet2photo 5 | n_cpu: 8 6 | img_height: 256 7 | img_width: 256 8 | channels: 3 9 | sample_interval: 3000 10 | hyperparameters: 11 | global_batch_size: 1 12 | lr: 0.0002 13 | b1: 0.5 14 | b2: 0.999 15 | decay_epoch: 100 # epoch from which to start lr decay 16 | n_residual_blocks: 9 # number of residual blocks in generator 17 | lambda_cyc: 10.0 18 | lambda_id: 5.0 19 | records_per_epoch: 6287 20 | searcher: 21 | name: single 22 | metric: loss_real_D 23 | max_length: 24 | epochs: 2000 25 | smaller_is_better: True 26 | entrypoint: determined_model_def:CycleGANTrial 27 | min_checkpoint_period: 28 | epochs: 1 29 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/const.yaml: 
-------------------------------------------------------------------------------- 1 | name: pix2pix_facades_const 2 | data: 3 | base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets 4 | dataset: facades 5 | BUFFER_SIZE: 400 6 | height: 256 7 | width: 256 8 | hyperparameters: 9 | global_batch_size: 1 10 | discriminator_lr: 2e-4 11 | discriminator_beta_1: 0.5 12 | generator_lr: 2e-4 13 | generator_beta_1: 0.5 14 | jitter: 30 15 | mirror: true 16 | records_per_epoch: 400 # There are 400 images in the facades training set 17 | min_validation_period: 18 | batches: 40 19 | min_checkpoint_period: 20 | batches: 400 21 | searcher: 22 | name: single 23 | metric: val_total_loss 24 | smaller_is_better: true 25 | max_length: 26 | batches: 4000 27 | entrypoint: model_def:Pix2PixTrial 28 | -------------------------------------------------------------------------------- /blog/python_sdk_demo/mednist_model/config.yaml: -------------------------------------------------------------------------------- 1 | hyperparameters: 2 | global_batch_size: 128 3 | lr: 0.001 4 | weight_decay: 5 | type: log 6 | base: 10 7 | minval: -4 8 | maxval: -1 9 | beta1: 10 | type: double 11 | minval: 0.1 12 | maxval: 0.999 13 | beta2: 14 | type: double 15 | minval: 0.1 16 | maxval: 0.999 17 | gamma: 0.1 18 | min_validation_period: 19 | epochs: 1 20 | searcher: 21 | name: adaptive_asha 22 | metric: val_accuracy 23 | smaller_is_better: false 24 | max_length: 25 | epochs: 1 26 | max_trials: 3 27 | mode: aggressive 28 | resources: 29 | slots_per_trial: 1 30 | entrypoint: model_def:MyMEDMnistTrial 31 | max_restarts: 0 32 | 33 | -------------------------------------------------------------------------------- /computer_vision/unets_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: unets_tf_keras_distributed 2 | data: 3 | BUFFER_SIZE: 1000 4 | data_file: mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5 5 | 6 | hyperparameters: 7 
| learning_rate: 1.0e-4 8 | learning_rate_decay: 1.0e-6 9 | layer1_dense_size: 16 10 | global_batch_size: 512 # per slot batch size = 64 11 | OUTPUT_CHANNELS: 3 12 | 13 | searcher: 14 | name: single 15 | metric: val_accuracy 16 | smaller_is_better: false 17 | max_length: 18 | batches: 160 19 | 20 | resources: 21 | slots_per_trial: 8 22 | 23 | min_validation_period: 24 | batches: 8 25 | scheduling_unit: 8 26 | entrypoint: model_def:UNetsTrial 27 | environment: 28 | image: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1 29 | -------------------------------------------------------------------------------- /gan/cyclegan/8-gpus.yaml: -------------------------------------------------------------------------------- 1 | description: Cycle GAN Pytorch 8 GPUs 2 | data: 3 | downloaded_path: /tmp 4 | dataset_name: monet2photo 5 | n_cpu: 8 6 | img_height: 256 7 | img_width: 256 8 | channels: 3 9 | sample_interval: 3000 10 | hyperparameters: 11 | global_batch_size: 8 12 | lr: 0.0002 13 | b1: 0.5 14 | b2: 0.999 15 | decay_epoch: 100 # epoch from which to start lr decay 16 | n_residual_blocks: 9 # number of residual blocks in generator 17 | lambda_cyc: 10.0 18 | lambda_id: 5.0 19 | records_per_epoch: 6287 20 | searcher: 21 | name: single 22 | metric: loss_real_D 23 | max_length: 24 | epochs: 2000 25 | smaller_is_better: True 26 | entrypoint: determined_model_def:CycleGANTrial 27 | resources: 28 | slots_per_trial: 8 29 | min_checkpoint_period: 30 | epochs: 1 31 | -------------------------------------------------------------------------------- /gan/cyclegan/64-gpus.yaml: -------------------------------------------------------------------------------- 1 | description: Cycle GAN Pytorch 64 GPUs 2 | data: 3 | downloaded_path: /tmp 4 | dataset_name: monet2photo 5 | n_cpu: 8 6 | img_height: 256 7 | img_width: 256 8 | channels: 3 9 | sample_interval: 3000 10 | hyperparameters: 11 | global_batch_size: 64 12 | lr: 0.0002 13 | b1: 0.5 14 | b2: 0.999 15 | decay_epoch: 100 
# epoch from which to start lr decay 16 | n_residual_blocks: 9 # number of residual blocks in generator 17 | lambda_cyc: 10.0 18 | lambda_id: 5.0 19 | records_per_epoch: 6287 20 | searcher: 21 | name: single 22 | metric: loss_real_D 23 | max_length: 24 | epochs: 2000 25 | smaller_is_better: True 26 | entrypoint: determined_model_def:CycleGANTrial 27 | resources: 28 | slots_per_trial: 64 29 | min_checkpoint_period: 30 | epochs: 1 31 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: pix2pix_facades_distributed 2 | data: 3 | base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets 4 | dataset: facades 5 | BUFFER_SIZE: 400 6 | height: 256 7 | width: 256 8 | hyperparameters: 9 | global_batch_size: 4 10 | discriminator_lr: 2e-4 11 | discriminator_beta_1: 0.5 12 | generator_lr: 2e-4 13 | generator_beta_1: 0.5 14 | jitter: 30 15 | mirror: true 16 | records_per_epoch: 400 # There are 400 images in the facades training set 17 | min_validation_period: 18 | batches: 40 19 | min_checkpoint_period: 20 | batches: 400 21 | searcher: 22 | name: single 23 | metric: val_total_loss 24 | smaller_is_better: true 25 | max_length: 26 | batches: 4000 27 | entrypoint: model_def:Pix2PixTrial 28 | resources: 29 | slots_per_trial: 4 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # All log files 6 | *.log 7 | 8 | # Jupyter Notebook 9 | .ipynb_checkpoints 10 | 11 | # pyenv 12 | .python-version 13 | 14 | # dotenv 15 | .env 16 | 17 | # virtualenv 18 | .venv 19 | venv/ 20 | ENV/ 21 | 22 | # mypy 23 | .mypy_cache/ 24 | 25 | # Determined distributable package 26 | determined-*.tar.gz 27 | 28 | # All Python wheels 29 | *.whl 30 
| 31 | # Node modules 32 | node_modules/ 33 | 34 | # VSCode 35 | .vscode/ 36 | 37 | # JetBrains IDEs (e.g., PyCharm and GoLand) 38 | .idea/ 39 | 40 | # gobin directory used for tests 41 | gobin 42 | 43 | # MacOS system files 44 | *.DS_Store 45 | .dccache 46 | 47 | # Hydra output 48 | model_hub/mmdetection/hydra/outputs 49 | 50 | build/ -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/configs/config.yaml: -------------------------------------------------------------------------------- 1 | name: model_hub_mmdet_experiment 2 | defaults: 3 | - data: disk 4 | - profiling: disabled 5 | - searcher: single 6 | - hyperparameters: 7 | - base 8 | 9 | checkpoint_storage: 10 | save_trial_latest: 5 11 | 12 | min_validation_period: 13 | batches: 7320 14 | 15 | environment: 16 | image: 17 | gpu: determinedai/model-hub-mmdetection:0.26.2-dev0 18 | environment_variables: 19 | - OMP_NUM_THREADS=1 # Following pytorch dtrain, this environment variable is set to 1 to avoid overloading the system. 
@dataclass
class BackboneMetadata:
    """Static description of a backbone: its feature width and how to build it."""

    # Value recorded as the backbone's feature size.
    feature_size: int
    # Zero-argument factory that returns the backbone module.
    build_fn: Callable[[], nn.Module]


def _pretrained_builder(factory_name: str) -> Callable[[], nn.Module]:
    """Return a factory calling ``models.<factory_name>(pretrained=True)``."""
    # The attribute is looked up at call time, as with a plain lambda.
    return lambda: getattr(models, factory_name)(pretrained=True)


# (torchvision factory name, feature size) pairs, in registration order.
_BACKBONE_SPECS = (
    ("resnet18", 512),
    ("resnet34", 512),
    ("resnet50", 2048),
    ("resnet101", 2048),
)

BACKBONE_METADATA_BY_NAME = {
    backbone: BackboneMetadata(
        feature_size=width,
        build_fn=_pretrained_builder(backbone),
    )
    for backbone, width in _BACKBONE_SPECS
}
model_def:DetectronTrial 21 | bind_mounts: 22 | - host_path: /path/to/data 23 | container_path: /mnt/dtrain-fsx/detectron2 24 | read_only: true 25 | min_validation_period: 26 | batches: 5000 27 | -------------------------------------------------------------------------------- /meta_learning/protonet_omniglot_pytorch/fetch_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Source: https://github.com/alshedivat/meta-blocks/blob/master/benchmarks/omniglot/fetch_data.sh 4 | # Fetch Omniglot. 5 | # 6 | 7 | OMNIGLOT_URL=https://raw.githubusercontent.com/brendenlake/omniglot/master/python 8 | 9 | set -e 10 | 11 | mkdir tmp 12 | trap 'rm -r tmp' EXIT 13 | 14 | if [ ! -d data ]; then 15 | mkdir data 16 | fi 17 | 18 | if [ ! -d data/omniglot ]; then 19 | mkdir tmp/omniglot 20 | for name in images_background images_evaluation; do 21 | echo "Fetching omniglot/$name ..." 22 | curl -# "$OMNIGLOT_URL/$name.zip" >"tmp/$name.zip" 23 | echo "Extracting omniglot/$name ..." 
if __name__ == "__main__":
    # Problem size for the measurement.
    batch_size, seq_len, d_model, n_heads = 2, 4096, 1024, 32
    dtype = torch.bfloat16
    # Device and precision shared by the input tensor and the attention layer.
    placement = {"device": "cuda", "dtype": dtype}

    inputs = torch.randn(batch_size, seq_len, d_model, requires_grad=True, **placement)
    attn = layers.Attention(d_model=d_model, n_heads=n_heads, **placement)

    # Run one forward pass inside both contexts; the layer's own parameters are
    # passed as tensors for the saved-tensor context to ignore.
    with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext(
        ignored_tensors=attn.parameters()
    ) as saved:
        out = attn(inputs)

    # The "=" specifier prints the expression text too, so names stay as-is.
    print(f'{mem.delta["current"]=}')
    print(f"{saved.saved_tensor_mem=}")
    print(f"{saved.saved_tensor_mem/out.numel()=}")
def get_assistant_prompt():
    """Return the ChatML prefix that opens an assistant turn."""
    assistant_turn_prefix = "<|im_start|>assistant\n"
    return assistant_turn_prefix


def get_response_template_ids(tokenizer):
    """Encode the assistant-turn prefix with ``tokenizer``, without special tokens."""
    prefix = get_assistant_prompt()
    return tokenizer.encode(prefix, add_special_tokens=False)


def maybe_add_generation_prompt(text: str) -> str:
    """Return ``text`` followed by the assistant-turn prefix.

    The prefix is appended unconditionally; no check is made for an
    already-present prefix.
    """
    return "".join((text, get_assistant_prompt()))
# Maps a model-type key to its (config class, sequence-classification model
# class, tokenizer class) triple.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertForSequenceClassification, BertTokenizer),
    xlnet=(XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
    xlm=(XLMConfig, XLMForSequenceClassification, XLMTokenizer),
    roberta=(RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
    distilbert=(
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
)
3 | entrypoint: >- 4 | python3 -m determined.launch.torch_distributed 5 | python3 inference_example.py 6 | resources: 7 | slots_per_trial: 2 8 | searcher: 9 | name: grid 10 | metric: x 11 | max_length: 100 12 | hyperparameters: 13 | # Replace with the name of the model to run inference on 14 | model_name: cifar_checkpoints 15 | # Replace with the model versions to run inference on 16 | model_version: 17 | type: categorical 18 | vals: 19 | - 1 20 | - 2 21 | - 3 22 | - 4 23 | - 5 24 | - 6 25 | - 7 26 | - 8 27 | - 9 28 | - 10 29 | - 11 30 | - 12 31 | - 13 32 | - 14 33 | max_restarts: 0 34 | bind_mounts: 35 | - host_path: /tmp 36 | container_path: /tmp 37 | read_only: false 38 | -------------------------------------------------------------------------------- /deepspeed/deepspeed_dcgan/mnist.yaml: -------------------------------------------------------------------------------- 1 | name: dcgan_deepspeed_mnist 2 | data: 3 | dataroot: /data 4 | dataset: mnist 5 | image_size: 64 6 | hyperparameters: 7 | deepspeed_config: ds_config.json 8 | noise_length: 100 9 | generator_width_base: 64 10 | discriminator_width_base: 64 11 | data_workers: 16 12 | environment: 13 | environment_variables: 14 | - NCCL_DEBUG=INFO 15 | # You may need to modify this to match your network configuration. 
class EG(Optimizer):
    """Exponentiated-gradient optimizer.

    Every step multiplies each parameter element-wise by ``exp(-lr * grad)``
    and then replaces the parameter data with ``normalize_fn(data)``.
    """

    def __init__(self, params, lr=required, normalize_fn=lambda x: x):
        lr_was_given = lr is not required
        if lr_was_given and lr < 0.0:
            raise ValueError(f"Invalid learning rate: {lr}")
        self.normalize_fn = normalize_fn
        super().__init__(params, {"lr": lr})

    @torch.no_grad()
    def step(self, closure=None):
        """Apply one update; return the closure's value, or None without a closure."""
        loss = None
        if closure is not None:
            # The closure may need gradients even though step() disables them.
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            step_size = group["lr"]
            for param in group["params"]:
                grad = param.grad
                if grad is None:
                    # Parameters without a gradient are left untouched.
                    continue
                param.mul_(torch.exp(-step_size * grad))
                param.data = self.normalize_fn(param.data)

        return loss
num_layers: 2 12 | dropout: 0.2 13 | bptt: 35 14 | lr: 20 15 | # Transformer Model Only Hyperparameters 16 | num_heads: 2 17 | # LSTM/GRU Model Only Hyperparameters 18 | # tied: False 19 | resources: 20 | slots_per_trial: 1 21 | records_per_epoch: 59660 22 | searcher: 23 | name: single 24 | metric: validation_loss 25 | max_length: 26 | epochs: 40 27 | smaller_is_better: true 28 | min_validation_period: 29 | epochs: 1 30 | data: 31 | use_bind_mount: True 32 | bind_mount_path: /data 33 | use_cache: True 34 | entrypoint: model_def:WordLanguageModelTrial 35 | bind_mounts: 36 | - host_path: /tmp 37 | container_path: /data 38 | read_only: false -------------------------------------------------------------------------------- /nas/gaea_pytorch/search/const.yaml: -------------------------------------------------------------------------------- 1 | name: gaea_search 2 | 3 | data: 4 | download_dir: /data 5 | 6 | bind_mounts: 7 | - host_path: /tmp 8 | container_path: /data 9 | read_only: false 10 | 11 | hyperparameters: 12 | # Number of classes in dataset 13 | n_classes: 10 14 | # Channel shuffle factor. 1 / shuffle_factor channels are activated at a given time. 
15 | shuffle_factor: 4 16 | global_batch_size: 256 17 | learning_rate: 0.1 18 | momentum: 0.9 19 | min_learning_rate: 0 20 | scheduler_epochs: 50 21 | weight_decay: 3.0e-4 22 | arch_learning_rate: 0.1 23 | init_channels: 16 24 | layers: 8 25 | nodes: 4 26 | 27 | resources: 28 | slots_per_trial: 2 29 | 30 | min_validation_period: 31 | batches: 100 32 | 33 | records_per_epoch: 25000 34 | searcher: 35 | name: single 36 | metric: top1_accuracy 37 | smaller_is_better: false 38 | max_length: 39 | epochs: 50 40 | 41 | optimizations: 42 | aggregation_frequency: 1 43 | 44 | entrypoint: model_def:GAEASearchTrial 45 | -------------------------------------------------------------------------------- /nlp/word_language_model/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: word_language_modeling_distributed 2 | hyperparameters: 3 | global_batch_size: 50 4 | eval_batch_size: 10 5 | max_grad_norm: 0.25 6 | model_cls: Transformer 7 | # model_cls: LSTM 8 | # model_cls: GRU 9 | word_embeddings_size: 200 10 | num_hidden: 200 11 | num_layers: 2 12 | dropout: 0.2 13 | bptt: 35 14 | lr: 20 15 | # Transformer Model Only Hyperparameters 16 | num_heads: 2 17 | # LSTM/GRU Model Only Hyperparameters 18 | # tied: False 19 | resources: 20 | slots_per_trial: 8 21 | records_per_epoch: 59660 22 | searcher: 23 | name: single 24 | metric: validation_loss 25 | max_length: 26 | epochs: 40 27 | smaller_is_better: true 28 | min_validation_period: 29 | epochs: 1 30 | data: 31 | use_bind_mount: True 32 | bind_mount_path: /data 33 | use_cache: True 34 | entrypoint: model_def:WordLanguageModelTrial 35 | bind_mounts: 36 | - host_path: /tmp 37 | container_path: /data 38 | read_only: false -------------------------------------------------------------------------------- /deepspeed/cifar10_cpu_offloading/ds_config_no_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 128, 3 | 
"steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.001, 8 | "betas": [ 9 | 0.8, 10 | 0.999 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupLR", 18 | "params": { 19 | "warmup_min_lr": 0, 20 | "warmup_max_lr": 0.001, 21 | "warmup_num_steps": 1000 22 | } 23 | }, 24 | "zero_optimization": { 25 | "stage": 0, 26 | "allgather_partitions": true, 27 | "allgather_bucket_size": 5e8, 28 | "overlap_comm": true, 29 | "reduce_scatter": true, 30 | "reduce_bucket_size": 5e8, 31 | "contiguous_gradients": true 32 | }, 33 | "gradient_clipping": 1.0, 34 | "fp16": { 35 | "enabled": true, 36 | "loss_scale": 0, 37 | "initial_scale_power": 5, 38 | "loss_scale_window": 1000, 39 | "hysteresis": 2, 40 | "min_loss_scale": 1 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /computer_vision/detectron2_coco_pytorch/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM determinedai/environments:cuda-10.2-base-gpu-0.20.1 2 | 3 | RUN pip install tensorboard cmake onnx # cmake from apt-get is too old 4 | RUN pip install torch==1.10 torchvision==0.11.1 -f https://download.pytorch.org/whl/cu101/torch_stable.html 5 | 6 | RUN pip install 'git+https://github.com/facebookresearch/fvcore' 7 | # install detectron2 8 | RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo 9 | # set FORCE_CUDA because during `docker build` cuda is not accessible 10 | ENV FORCE_CUDA="1" 11 | # This will by default build detectron2 for all common cuda architectures and take a lot more time, 12 | # because inside `docker build`, there is no way to tell which architecture will be used. 
def get_train_dataset(worker_rank: int):
    """
    Return the MNIST training images as a shuffled ``tf.data.Dataset``.

    The images are loaded through ``tf.keras.datasets.mnist.load_data`` using a file name
    that embeds ``worker_rank`` (``mnist-<rank>.npz``), reshaped to (N, 28, 28, 1) float32
    and rescaled to [-1, 1]. The dataset is shuffled but not batched; labels are dropped.
    """
    archive_name = f"mnist-{worker_rank}.npz"
    train_split, _ = tf.keras.datasets.mnist.load_data(path=archive_name)
    images = train_split[0]

    # Add a trailing channel axis and switch to float so the rescale below is exact.
    num_images = images.shape[0]
    images = images.reshape(num_images, 28, 28, 1).astype("float32")

    # Map pixel values from [0, 255] onto [-1, 1].
    images = (images - 127.5) / 127.5

    # Shuffle only; batching is left to the caller.
    return tf.data.Dataset.from_tensor_slices(images).shuffle(50000)


def get_validation_dataset(worker_rank: int):
    """
    Return the MNIST test images as a shuffled ``tf.data.Dataset``.

    Same preprocessing as the training split: per-worker file name, reshape to
    (N, 28, 28, 1) float32, rescale to [-1, 1], shuffle with a 50000-element buffer.
    """
    archive_name = f"mnist-{worker_rank}.npz"
    _, test_split = tf.keras.datasets.mnist.load_data(path=archive_name)
    images = test_split[0]

    # Add a trailing channel axis and switch to float so the rescale below is exact.
    num_images = images.shape[0]
    images = images.reshape(num_images, 28, 28, 1).astype("float32")

    # Map pixel values from [0, 255] onto [-1, 1].
    images = (images - 127.5) / 127.5

    # Shuffle only; batching is left to the caller.
    return tf.data.Dataset.from_tensor_slices(images).shuffle(50000)
eval_accuracy 14 | smaller_is_better: false 15 | hyperparameters: 16 | model: "mistralai/Mistral-7B-Instruct-v0.2" 17 | dataset_subset: "easy" 18 | lora: false 19 | training_args: 20 | output_dir: "/tmp/llm_finetuning" 21 | max_steps: 5000 22 | per_device_train_batch_size: 2 23 | per_device_eval_batch_size: 4 24 | bf16: true 25 | evaluation_strategy: "steps" 26 | eval_steps: 1000 27 | logging_strategy: "steps" 28 | logging_steps: 100 29 | save_strategy: "steps" 30 | save_steps: 5000 31 | learning_rate: 1e-5 32 | deepspeed: "ds_configs/ds_config_stage_3.json" 33 | entrypoint: >- 34 | python -m determined.launch.deepspeed 35 | python finetune.py 36 | max_restarts: 0 -------------------------------------------------------------------------------- /gan/gan_mnist_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch MNIST GAN Example 2 | 3 | This example demonstrates how to build a simple GAN on the MNIST dataset using 4 | Determined's PyTorch API. This example is adapted from this [PyTorch Lightning GAN 5 | example](https://github.com/Lightning-AI/pytorch-lightning/blob/master/examples/pytorch/domain_templates/generative_adversarial_net.py). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 9 | * **data.py**: The data loading and preparation code for the model. 10 | 11 | ### Configuration Files 12 | * **const.yaml**: Train the model with constant hyperparameter values. 13 | * **distributed.yaml**: Same as const.yaml, but instead uses multiple GPUs (distributed training). 14 | 15 | ## To Run 16 | Installation instructions can be found under `docs/install-admin.html` or at [Determined installation page](https://docs.determined.ai/latest/index.html). 17 | After configuring the settings in `const.yaml`, run the following command: `det -m experiment create -f const.yaml . 
` 18 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/ds_configs/ds_config_stage_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "zero_optimization": { 20 | "stage": 2, 21 | "allgather_partitions": true, 22 | "allgather_bucket_size": 2e8, 23 | "overlap_comm": true, 24 | "reduce_scatter": true, 25 | "reduce_bucket_size": 2e8, 26 | "contiguous_gradients": true 27 | }, 28 | "gradient_accumulation_steps": "auto", 29 | "gradient_clipping": "auto", 30 | "train_batch_size": "auto", 31 | "train_micro_batch_size_per_gpu": "auto", 32 | "flops_profiler": { 33 | "enabled": false, 34 | "profile_step": 1, 35 | "module_depth": -1, 36 | "top_modules": 1, 37 | "detailed": true, 38 | "output_file": null 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/lora.yaml: -------------------------------------------------------------------------------- 1 | name: mistral lora easy 2 | debug: false 3 | environment: 4 | environment_variables: 5 | - NCCL_DEBUG=INFO 6 | image: 7 | gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14 8 | cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7 9 | resources: 10 | slots_per_trial: 2 11 | searcher: 12 | name: single 13 | max_length: 14 | batches: 5000 15 | metric: eval_accuracy 16 | smaller_is_better: false 17 | hyperparameters: 18 | model: "mistralai/Mistral-7B-Instruct-v0.2" 19 | dataset_subset: "easy" 20 | lora: true 21 | training_args: 22 | output_dir: "/tmp/llm_finetuning" 23 | max_steps: 5000 24 | 
per_device_train_batch_size: 8 25 | per_device_eval_batch_size: 4 26 | bf16: true 27 | evaluation_strategy: "steps" 28 | eval_steps: 1000 29 | logging_strategy: "steps" 30 | logging_steps: 100 31 | save_strategy: "steps" 32 | save_steps: 1000 33 | learning_rate: 1e-5 34 | entrypoint: >- 35 | python -m determined.launch.torch_distributed 36 | python finetune.py 37 | max_restarts: 0 -------------------------------------------------------------------------------- /deepspeed/deepspeed_dcgan/mnist_grad_accum.yaml: -------------------------------------------------------------------------------- 1 | name: dcgan_deepspeed_mnist_grad_accum 2 | data: 3 | dataroot: /data 4 | dataset: mnist 5 | image_size: 64 6 | hyperparameters: 7 | deepspeed_config: ds_config.json 8 | noise_length: 100 9 | generator_width_base: 64 10 | discriminator_width_base: 64 11 | data_workers: 16 12 | overwrite_deepspeed_args: 13 | gradient_accumulation_steps: 4 14 | environment: 15 | environment_variables: 16 | - NCCL_DEBUG=INFO 17 | # You may need to modify this to match your network configuration. 
18 | - NCCL_SOCKET_IFNAME=ens,eth,ib 19 | image: 20 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1 21 | bind_mounts: 22 | - host_path: /tmp 23 | container_path: /data 24 | resources: 25 | slots_per_trial: 2 26 | searcher: 27 | name: single 28 | metric: no_validation_metric 29 | max_length: 30 | batches: 100000 31 | min_validation_period: 32 | batches: 0 33 | entrypoint: 34 | - python3 35 | - -m 36 | - determined.launch.deepspeed 37 | - --trial 38 | - model_def:DCGANTrial 39 | max_restarts: 0 40 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/ds_configs/ds_config_stage_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | "gradient_accumulation_steps": "auto", 35 | "gradient_clipping": "auto", 36 | "train_batch_size": "auto", 37 | "train_micro_batch_size_per_gpu": "auto" 38 | } 39 | -------------------------------------------------------------------------------- /deepspeed/deepspeed_dcgan/cifar10_zero2.yaml: -------------------------------------------------------------------------------- 1 | name: dcgan_deepspeed_cifar10 2 | data: 3 | dataroot: /data 4 | 
dataset: cifar10 5 | image_size: 64 6 | hyperparameters: 7 | deepspeed_config: ds_config.json 8 | noise_length: 100 9 | generator_width_base: 64 10 | discriminator_width_base: 64 11 | data_workers: 16 12 | overwrite_deepspeed_args: 13 | zero_optimization.stage: 2 14 | fp16.enabled: true 15 | environment: 16 | environment_variables: 17 | - NCCL_DEBUG=INFO 18 | # You may need to modify this to match your network configuration. 19 | - NCCL_SOCKET_IFNAME=ens,eth,ib 20 | image: 21 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1 22 | bind_mounts: 23 | - host_path: /tmp 24 | container_path: /data 25 | resources: 26 | slots_per_trial: 2 27 | searcher: 28 | name: single 29 | metric: no_validation_metric 30 | max_length: 31 | batches: 100000 32 | min_validation_period: 33 | batches: 0 34 | entrypoint: 35 | - python3 36 | - -m 37 | - determined.launch.deepspeed 38 | - --trial 39 | - model_def:DCGANTrial 40 | max_restarts: 0 41 | -------------------------------------------------------------------------------- /deepspeed/cifar10_cpu_offloading/zero_3_cpu_offload.yaml: -------------------------------------------------------------------------------- 1 | name: No OOM error 2 | debug: false 3 | #profiling: 4 | # enabled: true 5 | # begin_on_batch: 1 6 | # end_after_batch: 10 7 | # sync_timings: false 8 | hyperparameters: 9 | deepspeed_config: ds_config_offload.json 10 | deepspeed_offload: true 11 | environment: 12 | environment_variables: 13 | - NCCL_DEBUG=INFO 14 | # You may need to modify this to match your network configuration. 
15 | - NCCL_SOCKET_IFNAME=ens,eth,ib 16 | image: 17 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1 18 | bind_mounts: 19 | - host_path: /tmp 20 | container_path: /data 21 | - host_path: /tmp 22 | container_path: /root/.cache 23 | resources: 24 | slots_per_trial: 2 25 | records_per_epoch: 5000 26 | searcher: 27 | name: single 28 | metric: accuracy 29 | smaller_is_better: false 30 | max_length: 31 | epochs: 1 32 | entrypoint: 33 | - python3 34 | - -m 35 | - determined.launch.deepspeed 36 | - --trial 37 | - model_def:CIFARTrial 38 | checkpoint_policy: none 39 | max_restarts: 0 40 | scheduling_unit: 2000 41 | -------------------------------------------------------------------------------- /deepspeed/cifar10_cpu_offloading/zero_no_offload.yaml: -------------------------------------------------------------------------------- 1 | name: OOM error 2 | debug: false 3 | #profiling: 4 | # enabled: true 5 | # begin_on_batch: 1 6 | # end_after_batch: 1000 7 | # sync_timings: false 8 | hyperparameters: 9 | deepspeed_config: ds_config_no_offload.json 10 | deepspeed_offload: false 11 | environment: 12 | environment_variables: 13 | - NCCL_DEBUG=INFO 14 | # You may need to modify this to match your network configuration. 
15 | - NCCL_SOCKET_IFNAME=ens,eth,ib 16 | image: 17 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1 18 | bind_mounts: 19 | - host_path: /tmp 20 | container_path: /data 21 | - host_path: /tmp 22 | container_path: /root/.cache 23 | resources: 24 | slots_per_trial: 2 25 | records_per_epoch: 5000 26 | searcher: 27 | name: single 28 | metric: accuracy 29 | smaller_is_better: false 30 | max_length: 31 | epochs: 1 32 | entrypoint: 33 | - python3 34 | - -m 35 | - determined.launch.deepspeed 36 | - --trial 37 | - model_def:CIFARTrial 38 | checkpoint_policy: none 39 | max_restarts: 0 40 | scheduling_unit: 2000 41 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/adaptive.yaml: -------------------------------------------------------------------------------- 1 | name: pix2pix_facades_adaptive_asha 2 | data: 3 | base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets 4 | dataset: facades 5 | BUFFER_SIZE: 400 6 | height: 256 7 | width: 256 8 | hyperparameters: 9 | global_batch_size: 1 10 | discriminator_lr: 11 | type: log 12 | base: 10 13 | minval: -5 14 | maxval: -4 15 | discriminator_beta_1: 16 | type: log 17 | base: 10 18 | minval: -1 19 | maxval: 0 20 | generator_lr: 21 | type: log 22 | base: 10 23 | minval: -5 24 | maxval: -4 25 | generator_beta_1: 26 | type: log 27 | base: 10 28 | minval: -1 29 | maxval: 0 30 | jitter: 31 | type: int 32 | minval: 0 33 | maxval: 30 34 | mirror: true 35 | records_per_epoch: 400 # There are 400 images in the facades training set 36 | min_validation_period: 37 | batches: 40 38 | min_checkpoint_period: 39 | batches: 400 40 | searcher: 41 | name: adaptive_asha 42 | metric: val_total_loss 43 | smaller_is_better: true 44 | max_length: 45 | batches: 4000 46 | max_trials: 50 47 | entrypoint: model_def:Pix2PixTrial 48 | -------------------------------------------------------------------------------- /blog/tp/test_dot_product_local.py: 
def merge_dicts(d1: Dict[A, B], d2: Dict[A, B], f: Callable[[B, B], B]) -> Dict[A, B]:
    """
    Merges dictionaries with a custom merge function.

    E.g. if k in d1 and k in d2, result[k] == f(d1[k], d2[k]).
    Otherwise, if e.g. k is in only d1, result[k] == d1[k].

    The result keeps a deterministic key order: every key of d1 in d1's order, followed by
    the keys that appear only in d2, in d2's order. Neither input dictionary is modified.

    Args:
        d1: Left-hand dictionary; its values are passed to ``f`` as the first argument.
        d2: Right-hand dictionary; its values are passed to ``f`` as the second argument.
        f: Called once per shared key to combine the two values.

    Returns:
        A new dictionary containing the union of the keys of d1 and d2.
    """
    # Copy d1 first so its keys (and their order) form the start of the result.
    new_dict = dict(d1)
    for k, v in d2.items():
        # Keys are unique within d2, so new_dict[k] is still d1's value at this point.
        new_dict[k] = f(new_dict[k], v) if k in d1 else v
    return new_dict


class LambdaModule(nn.Module):
    """
    Wrap a lambda as an nn.Module.

    The callable is stored on the module as ``lam`` and invoked with the single input
    passed to ``forward``; whatever it returns is returned unchanged.
    """

    def __init__(self, lam: Callable) -> None:
        super().__init__()
        self.lam = lam

    def forward(self, x: Any) -> Any:
        return self.lam(x)
/deepspeed/cifar10_moe/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 16, 3 | "steps_per_print": 2000, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.001, 8 | "betas": [ 9 | 0.8, 10 | 0.999 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupLR", 18 | "params": { 19 | "warmup_min_lr": 0, 20 | "warmup_max_lr": 0.001, 21 | "warmup_num_steps": 1000 22 | } 23 | }, 24 | "gradient_clipping": 1.0, 25 | "prescale_gradients": false, 26 | "fp16": { 27 | "enabled": true, 28 | "fp16_master_weights_and_grads": false, 29 | "loss_scale": 0, 30 | "loss_scale_window": 500, 31 | "hysteresis": 2, 32 | "min_loss_scale": 1, 33 | "initial_scale_power": 15 34 | }, 35 | "wall_clock_breakdown": false, 36 | "zero_optimization": { 37 | "stage": 0, 38 | "allgather_partitions": true, 39 | "reduce_scatter": true, 40 | "allgather_bucket_size": 50000000, 41 | "reduce_bucket_size": 50000000, 42 | "overlap_comm": true, 43 | "contiguous_gradients": true, 44 | "cpu_offload": false 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /computer_vision/byol_pytorch/evaluate_result.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from determined.experimental import client 4 | 5 | if __name__ == "__main__": 6 | parser = ArgumentParser( 7 | description="Start an evaluation run (w/ classifier training) from the top checkpoint of a given experiment." 
#!/bin/bash
#
# Startup hook for the CycleGAN example.
#
# Downloads the dataset named by $FILE into $TMP_DIR, unpacks it, and moves the
# trainA/trainB/testA/testB folders into the train/{A,B} and test/{A,B} layout.
#
# Failures are handled with explicit checks rather than `set -e`, so no shell
# options leak into the caller if this hook happens to be sourced.

# unzip is required to unpack the archive. apt-get is used instead of apt (whose
# CLI is not intended for scripts), and -y keeps the install from ever waiting
# for confirmation.
if ! command -v unzip > /dev/null 2>&1; then
    apt-get install -y unzip
fi

FILE=monet2photo
TMP_DIR=/tmp

# Reject anything that is not one of the published CycleGAN datasets.
case "$FILE" in
    ae_photos | apple2orange | summer2winter_yosemite | horse2zebra | monet2photo | \
    cezanne2photo | ukiyoe2photo | vangogh2photo | maps | cityscapes | facades | \
    iphone2dslr_flower) ;;
    *)
        echo "Available datasets are: apple2orange, summer2winter_yosemite, horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos"
        exit 1
        ;;
esac

URL="https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/$FILE.zip"
ZIP_FILE="$TMP_DIR/$FILE.zip"
TARGET_DIR="$TMP_DIR/$FILE"

# -N (timestamping) has no effect together with -O, so only -O is passed.
if ! wget --no-verbose "$URL" -O "$ZIP_FILE"; then
    echo "Failed to download $URL" >&2
    rm -f "$ZIP_FILE"
    exit 1
fi

if ! unzip -q "$ZIP_FILE" -d "$TMP_DIR"; then
    echo "Failed to extract $ZIP_FILE" >&2
    exit 1
fi
rm "$ZIP_FILE"

# Adapt to project expected directory hierarchy
mkdir -p "$TARGET_DIR/train" "$TARGET_DIR/test"
mv "$TARGET_DIR/trainA" "$TARGET_DIR/train/A"
mv "$TARGET_DIR/trainB" "$TARGET_DIR/train/B"
mv "$TARGET_DIR/testA" "$TARGET_DIR/test/A"
mv "$TARGET_DIR/testB" "$TARGET_DIR/test/B"
def check_for_missing(cfg):
    """
    Recursively verify that a plain config container holds no "???" placeholders.

    Dictionaries and lists are walked depth-first; any other value is ignored. A "???"
    found as a dictionary value or as a direct list element is reported.

    Args:
        cfg: A nested structure of dicts, lists and scalar values.

    Raises:
        MissingMandatoryValue: on the first "???" encountered, naming the dictionary key
            or the list index it was found under.
    """
    if isinstance(cfg, dict):
        for k, item in cfg.items():
            if item == "???":
                raise MissingMandatoryValue(f"Missing mandatory value for {k}.")
            check_for_missing(item)
    elif isinstance(cfg, list):
        for index, item in enumerate(cfg):
            # A bare "???" inside a list is a string, so recursing into it would
            # match neither branch above; it has to be checked here.
            if item == "???":
                raise MissingMandatoryValue(f"Missing mandatory value for list item {index}.")
            check_for_missing(item)
helper function now to check for missing values. 26 | # In the next version of omegaconf, we will be able to check for missing values by 27 | # passing throw_on_missing to the OmegaConf.to_container call above. 28 | check_for_missing(config) 29 | 30 | master = Determined() 31 | exp = master.create_experiment(config, CONTEXT_DIR) 32 | exp.activate() 33 | 34 | 35 | if __name__ == "__main__": 36 | my_experiment() 37 | -------------------------------------------------------------------------------- /features/custom_reducers_mnist_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Custom Reducers (MNIST) 2 | This tutorial shows how to use custom reducers with PyTorch. In this example, 3 | the custom reducer is a per-class F1 score. 4 | 5 | This example is based on Determined's `mnist_pytorch` tutorial, with the custom 6 | reducer as the only modification. 7 | 8 | ## Files 9 | * **model_def.py**: Where the custom reducer is defined and used. 10 | * All other files are identical to the `mnist_pytorch` tutorial code. 11 | 12 | ## To Run 13 | If you have not yet installed Determined, installation instructions can be found 14 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 15 | 16 | Run the following command: `det -m experiment create -f 17 | const.yaml .`. The other configurations can be run by specifying the appropriate 18 | configuration file in place of `const.yaml`. 19 | 20 | ## Results 21 | You should see the per-class F1 scores in the Determined WebUI and while 22 | viewing the tensorboard results for the experiment. The remaining metrics 23 | should match the behavior of the `mnist_pytorch` tutorial. 24 | 25 | The custom reducers should work whether you run a single-slot experiment or a 26 | multi-slot experiment with distributed training. 
27 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/ds_configs/ds_config_stage_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 1, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true 35 | }, 36 | "gradient_accumulation_steps": "auto", 37 | "gradient_clipping": "auto", 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "flops_profiler": { 41 | "enabled": true, 42 | "profile_step": 1, 43 | "module_depth": -1, 44 | "top_modules": 1, 45 | "detailed": true, 46 | "output_file": null 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/ds_configs/ds_config_stage_2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": 
"auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 2, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true 35 | }, 36 | "gradient_accumulation_steps": "auto", 37 | "gradient_clipping": "auto", 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "flops_profiler": { 41 | "enabled": true, 42 | "profile_step": 1, 43 | "module_depth": -1, 44 | "top_modules": 1, 45 | "detailed": true, 46 | "output_file": null 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/ds_configs/ds_config_stage_1.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 1, 29 | "allgather_partitions": true, 30 | "allgather_bucket_size": 2e8, 31 | "overlap_comm": true, 32 | "reduce_scatter": true, 33 | "reduce_bucket_size": 2e8, 34 | "contiguous_gradients": true 35 | }, 36 | "gradient_accumulation_steps": "auto", 37 | "gradient_clipping": "auto", 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "flops_profiler": { 41 | "enabled": true, 42 | "profile_step": 1, 43 | "module_depth": -1, 44 | "top_modules": 1, 45 | "detailed": true, 46 | "output_file": null 47 | } 48 | } 49 | 
-------------------------------------------------------------------------------- /blog/lora-parameters/README.md: -------------------------------------------------------------------------------- 1 | # Finding the best LoRA parameters 2 | 3 | We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 40 GB A100 GPUs utilizing DeepSpeed. 4 | 5 | See our [blog post](https://www.determined.ai/blog/lora-parameters) for our experiment results. 6 | 7 | To get started, first install Determined on your local machine: 8 | ```bash 9 | pip install determined 10 | ``` 11 | 12 | Then finetune with LoRA: 13 | ```bash 14 | det e create lora.yaml . 15 | ``` 16 | 17 | You can view the actual training code in `finetune.py`. 18 | 19 | 20 | ## Configuration 21 | 22 | Change configuration options in `lora.yaml`. Some important options are: 23 | - `slots_per_trial`: the number of GPUs to use. 24 | - `dataset_subset`: the difficulty subset to train on. 25 | - `per_device_train_batch_size`: the batch size per GPU. 26 | 27 | 28 | DeepSpeed configuration files are in the `ds_configs` folder. 29 | 30 | 31 | ## Contributors 32 | 33 | - By [Sze Wai Yuen](https://github.com/szewaiyuen6) 34 | - Built on `llm-finetuning` code by [Agnieszka Ciborowska](https://github.com/aciborowska) and [Kevin Musgrave](https://github.com/KevinMusgrave). 
-------------------------------------------------------------------------------- /nlp/albert_squad_pytorch/distributed_8gpu.yaml: -------------------------------------------------------------------------------- 1 | # After 2 epochs, model should hit 85.76/88.87 EM/F1 2 | name: ALBert_SQuAD_PyTorch_8gpu 3 | hyperparameters: 4 | global_batch_size: 16 5 | learning_rate: 5.0e-5 6 | model_type: 'albert' 7 | do_lower_case: true 8 | adam_epsilon: 1.0e-8 9 | weight_decay: 0 10 | num_warmup_steps: 1620 11 | max_seq_length: 384 12 | doc_stride: 128 13 | max_query_length: 64 14 | n_best_size: 20 15 | max_answer_length: 30 16 | null_score_diff_threshold: 0.0 17 | max_grad_norm: 1.0 18 | num_training_steps: 16500 # This is the number of optimizer steps. Train for 2 epochs 19 | use_radam: false 20 | resources: 21 | slots_per_trial: 8 22 | searcher: 23 | name: single 24 | metric: f1 25 | max_length: 26 | records: 264396 27 | smaller_is_better: false 28 | min_validation_period: 29 | records: 80000 30 | data: 31 | pretrained_model_name: "albert-xxlarge-v2" 32 | use_bind_mount: True 33 | bind_mount_path: /mnt/data 34 | task: "SQuAD2.0" # SQuAD 2.0 has 132198 examples.
35 | entrypoint: model_def:AlbertSQuADPyTorch 36 | optimizations: 37 | aggregation_frequency: 3 38 | bind_mounts: 39 | - host_path: /tmp/ 40 | container_path: /mnt/data 41 | read_only: false 42 | -------------------------------------------------------------------------------- /nlp/albert_squad_pytorch/distributed_64gpu.yaml: -------------------------------------------------------------------------------- 1 | # After 2 epochs, model should hit 86.24/89.06 EM/F1 2 | name: ALBert_SQuAD_PyTorch_64gpu 3 | hyperparameters: 4 | global_batch_size: 128 5 | learning_rate: 0.0002 6 | model_type: 'albert' 7 | do_lower_case: true 8 | adam_epsilon: 1.0e-8 9 | weight_decay: 0 10 | num_warmup_steps: 206 11 | max_seq_length: 384 12 | doc_stride: 128 13 | max_query_length: 64 14 | n_best_size: 20 15 | max_answer_length: 30 16 | null_score_diff_threshold: 0.0 17 | max_grad_norm: 1.0 18 | num_training_steps: 2064 # This is the number of optimizer steps. Train for 2 epochs 19 | use_radam: true 20 | resources: 21 | slots_per_trial: 64 22 | searcher: 23 | name: single 24 | metric: f1 25 | max_length: 26 | records: 264396 27 | smaller_is_better: false 28 | min_validation_period: 29 | records: 100000 30 | data: 31 | pretrained_model_name: "albert-xxlarge-v2" 32 | use_bind_mount: True 33 | bind_mount_path: /mnt/data 34 | task: "SQuAD2.0" # SQuAD 2.0 has 132198 examples.
35 | entrypoint: model_def:AlbertSQuADPyTorch 36 | optimizations: 37 | aggregation_frequency: 2 38 | bind_mounts: 39 | - host_path: /tmp/ 40 | container_path: /mnt/data 41 | read_only: false 42 | -------------------------------------------------------------------------------- /meta_learning/protonet_omniglot_pytorch/20way1shot.yaml: -------------------------------------------------------------------------------- 1 | name: omniglot_protonet 2 | data: 3 | data_path: ./data 4 | validation_portion: 0.25 5 | tasks_per_epoch_train: 100 6 | tasks_per_epoch_val: 1000 7 | train_workers: 8 8 | val_workers: 4 9 | 10 | hyperparameters: 11 | learning_rate: 1.0e-3 12 | weight_decay: 0 13 | reduce_every: 200 14 | lr_gamma: 0.5 15 | global_batch_size: 2 # how many tasks to train before performing a meta-update 16 | val_batch_size: 2 # how many tasks to evaluate on 17 | # Meta-training 18 | num_classes_train: 60 19 | num_support_train: 1 20 | num_query_train: 5 21 | # Meta-test 22 | num_classes_val: 20 #n-way 23 | num_support_val: 1 #k-shot 24 | # Model 25 | img_resize_dim: 28 # input will be 1 x img_resize_dim x img_resize_dim 26 | hidden_dim: 64 # intermediate number of channels 27 | embedding_dim: 64 # embedding number of channels 28 | 29 | resources: 30 | slots_per_trial: 2 31 | 32 | searcher: 33 | name: single 34 | metric: loss 35 | smaller_is_better: true 36 | # Original paper trained for 10,000 epochs with a plateau stopping condition 37 | max_length: 38 | batches: 30000 39 | 40 | entrypoint: model_def:OmniglotProtoNetTrial 41 | min_validation_period: 42 | batches: 5000 43 | checkpoint_policy: none 44 | -------------------------------------------------------------------------------- /meta_learning/protonet_omniglot_pytorch/20way5shot.yaml: -------------------------------------------------------------------------------- 1 | name: omniglot_protonet 2 | data: 3 | data_path: ./data 4 | validation_portion: 0.25 5 | tasks_per_epoch_train: 100 6 | tasks_per_epoch_val: 1000 7 | 
train_workers: 8 8 | val_workers: 4 9 | 10 | hyperparameters: 11 | learning_rate: 1.0e-3 12 | weight_decay: 0 13 | reduce_every: 200 14 | lr_gamma: 0.5 15 | global_batch_size: 2 # how many tasks to train before performing a meta-update 16 | val_batch_size: 2 # how many tasks to evaluate on 17 | # Meta-training 18 | num_classes_train: 60 19 | num_support_train: 5 20 | num_query_train: 5 21 | # Meta-test 22 | num_classes_val: 20 #n-way 23 | num_support_val: 5 #k-shot 24 | # Model 25 | img_resize_dim: 28 # input will be 1 x img_resize_dim x img_resize_dim 26 | hidden_dim: 64 # intermediate number of channels 27 | embedding_dim: 64 # embedding number of channels 28 | 29 | resources: 30 | slots_per_trial: 2 31 | 32 | searcher: 33 | name: single 34 | metric: loss 35 | smaller_is_better: true 36 | # Original paper trained for 10,000 epochs with a plateau stopping condition 37 | max_length: 38 | batches: 30000 39 | 40 | entrypoint: model_def:OmniglotProtoNetTrial 41 | min_validation_period: 42 | batches: 5000 43 | checkpoint_policy: none 44 | -------------------------------------------------------------------------------- /nlp/albert_squad_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | # After 2 epochs, model should hit 85.76/88.87 EM/F1 2 | name: ALBert_SQuAD_PyTorch_1gpu 3 | hyperparameters: 4 | global_batch_size: 2 5 | learning_rate: 5.0e-5 6 | model_type: 'albert' 7 | adam_epsilon: 1.0e-8 8 | weight_decay: 0 9 | num_warmup_steps: 13220 # 10% of total training 10 | max_seq_length: 384 11 | doc_stride: 128 12 | max_query_length: 64 13 | n_best_size: 20 14 | max_answer_length: 30 15 | null_score_diff_threshold: 0.0 16 | max_grad_norm: 1.0 17 | num_training_steps: 132198 # This is the number of optimizer steps.
Train for 2 epochs 18 | do_lower_case: true 19 | use_radam: false 20 | resources: 21 | slots_per_trial: 1 22 | searcher: 23 | name: single 24 | metric: f1 25 | max_length: 26 | records: 264396 27 | smaller_is_better: false 28 | min_validation_period: 29 | records: 80000 30 | data: 31 | pretrained_model_name: "albert-xxlarge-v2" 32 | use_bind_mount: True 33 | bind_mount_path: /mnt/data 34 | task: "SQuAD2.0" # SQuAD 2.0 has 132198 examples. 35 | entrypoint: model_def:AlbertSQuADPyTorch 36 | optimizations: 37 | aggregation_frequency: 24 38 | bind_mounts: 39 | - host_path: /tmp/ 40 | container_path: /mnt/data 41 | read_only: false 42 | -------------------------------------------------------------------------------- /deepspeed/pipeline_parallelism/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed CIFAR Example 2 | This example is adapted from the 3 | [pipeline parallelism example in the DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/pipeline_parallelism) 4 | repository. It is intended to demonstrate a simple use case of DeepSpeed's PipelineEngine with Determined. 5 | 6 | ## Files 7 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 8 | * **alexnet.py**: Specifies the AlexNet architecture. 9 | 10 | ### Configuration Files 11 | * **ds_config.json**: The DeepSpeed config file. 12 | * **distributed.yaml**: Determined config to train the model with 2-stage pipeline parallelism. 13 | 14 | ## Data 15 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html. 16 | 17 | ## To Run 18 | If you have not yet installed Determined, installation instructions can be found 19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 20 | 21 | Run the following command: 22 | ``` 23 | det experiment create distributed.yaml .
24 | ``` 25 | 26 | ## Results 27 | Training the model with the hyperparameter settings in `distributed.yaml` on 2 28 | NVidia Tesla V100s on a single node should yield a throughput of at least 800 samples/sec. 29 | -------------------------------------------------------------------------------- /deepspeed/cifar10_moe/zero_stages.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_zero_deepspeed 2 | debug: false 3 | hyperparameters: 4 | deepspeed_config: ds_config.json 5 | moe: false 6 | num_experts: 7 | - 2 8 | ep_world_size: 2 9 | mlp_type: standard 10 | top_k: 1 11 | min_capacity: 0 12 | noisy_gate_policy: RSample 13 | moe_param_group: true 14 | overwrite_deepspeed_args: 15 | zero_optimization.stage: 2 16 | environment: 17 | environment_variables: 18 | - NCCL_DEBUG=INFO 19 | # You may need to modify this to match your network configuration. 20 | - NCCL_SOCKET_IFNAME=ens,eth,ib 21 | # - CUDA_LAUNCH_BLOCKING=1 22 | # - NCCL_BLOCKING_WAIT=1 23 | # - NCCL_IB_DISABLE=1 24 | image: 25 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1 26 | bind_mounts: 27 | - host_path: /tmp 28 | container_path: /data 29 | - host_path: /tmp 30 | container_path: /root/.cache 31 | resources: 32 | slots_per_trial: 2 33 | records_per_epoch: 50000 34 | searcher: 35 | name: single 36 | metric: accuracy 37 | smaller_is_better: false 38 | max_length: 39 | epochs: 2 40 | entrypoint: 41 | - python3 42 | - -m 43 | - determined.launch.deepspeed 44 | - --trial 45 | - model_def:CIFARTrial 46 | checkpoint_policy: none 47 | max_restarts: 0 48 | scheduling_unit: 2000 49 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/ds_configs/ds_config_stage_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | 
"hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupDecayLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto", 28 | "total_num_steps": "auto" 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 3, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "sub_group_size": 1e9, 36 | "reduce_bucket_size": "auto", 37 | "stage3_prefetch_bucket_size": "auto", 38 | "stage3_param_persistence_threshold": "auto", 39 | "stage3_max_live_parameters": 1e9, 40 | "stage3_max_reuse_distance": 1e9, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "train_batch_size": "auto", 46 | "train_micro_batch_size_per_gpu": "auto" 47 | } 48 | -------------------------------------------------------------------------------- /deepspeed/cifar10_moe/README.md: -------------------------------------------------------------------------------- 1 | # DeepSpeed CIFAR Example 2 | This example is adapted from the 3 | [CIFAR example in the DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/cifar) 4 | repository. It is intended to demonstrate a simple usecase of DeepSpeed with Determined. 5 | 6 | ## Files 7 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 8 | 9 | ### Configuration Files 10 | * **ds_config.json**: The DeepSpeed config file. 11 | * **moe.yaml**: Determined config to train the model with Mixture of Experts enabled. 12 | * **zero_stages.yaml**: Same as `moe.yaml`, but trains the model with ZeRO stage 2 optimizer. 
13 | 14 | ## Data 15 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html. 16 | 17 | ## To Run 18 | If you have not yet installed Determined, installation instructions can be found 19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 20 | 21 | Run the following command: 22 | ``` 23 | det experiment create moe.yaml . 24 | ``` 25 | The other configuration can be run by specifying the appropriate configuration file in place 26 | of `moe.yaml`. 27 | 28 | ## Results 29 | Training the model with the hyperparameter settings in `moe.yaml` should yield 30 | a validation accuracy of ~45% after 2 epochs. 31 | -------------------------------------------------------------------------------- /gan/pix2pix_tf_keras/pix2pix/sampling.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def downsample(filters, size, apply_batchnorm=True): 5 | initializer = tf.random_normal_initializer(0.0, 0.02) 6 | 7 | result = tf.keras.Sequential() 8 | result.add( 9 | tf.keras.layers.Conv2D( 10 | filters, 11 | size, 12 | strides=2, 13 | padding="same", 14 | kernel_initializer=initializer, 15 | use_bias=False, 16 | ) 17 | ) 18 | 19 | if apply_batchnorm: 20 | result.add(tf.keras.layers.BatchNormalization()) 21 | 22 | result.add(tf.keras.layers.LeakyReLU()) 23 | 24 | return result 25 | 26 | 27 | def upsample(filters, size, apply_dropout=False): 28 | initializer = tf.random_normal_initializer(0.0, 0.02) 29 | 30 | result = tf.keras.Sequential() 31 | result.add( 32 | tf.keras.layers.Conv2DTranspose( 33 | filters, 34 | size, 35 | strides=2, 36 | padding="same", 37 | kernel_initializer=initializer, 38 | use_bias=False, 39 | ) 40 | ) 41 | 42 | result.add(tf.keras.layers.BatchNormalization()) 43 | 44 | if apply_dropout: 45 | result.add(tf.keras.layers.Dropout(0.5)) 46 | 47 | result.add(tf.keras.layers.ReLU()) 48 | 49 | return result 50 | 
-------------------------------------------------------------------------------- /custom_search_method/asha_search_method/local_search_runner/README.md: -------------------------------------------------------------------------------- 1 | # Custom SearchMethod with LocalSearchRunner 2 | 3 | In this example, we use LocalSearchRunner, which executes a custom SearchMethod on your local machine and 4 | orchestrates a multi-trial experiment on a Determined cluster. 5 | 6 | For an example of running the custom SearchMethod on a cluster, 7 | see `examples/custom_search_method/asha_search_method/remote_search_runner`. 8 | 9 | ## Files 10 | * **run_experiment.py**: The code for running the custom SearchMethod locally with LocalSearchRunner. 11 | 12 | ## To Run 13 | If you have not yet installed Determined, installation instructions can be found 14 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 15 | 16 | 1. Set the `DET_MASTER` environment variable, which is the network address of the Determined master. 17 | For instance, `export DET_MASTER=<master_host:port>`. 18 | 2. Run the following command in the `asha_search_method` directory to start LocalSearchRunner: `python local_search_runner/run_experiment.py`. 19 | 20 | ## Result 21 | LocalSearchRunner executes the custom SearchMethod on your local machine, 22 | while the multi-trial experiment for hyperparameter search is started on a Determined cluster. 23 | LocalSearchRunner handles the communication between the custom SearchMethod and the multi-trial experiment.
-------------------------------------------------------------------------------- /deepspeed/pipeline_parallelism/distributed.yaml: -------------------------------------------------------------------------------- 1 | name: cifar10_pipeline_parallel_deepspeed 2 | debug: false 3 | hyperparameters: 4 | deepspeed_config: ds_config.json 5 | pipe_parallel_size: 2 6 | part: parameters 7 | overwrite_deepspeed_args: 8 | train_micro_batch_size_per_gpu: 8 9 | bind_mounts: 10 | - host_path: /tmp 11 | container_path: /data 12 | - host_path: /tmp 13 | container_path: /root/.cache 14 | environment: 15 | #force_pull_image: true 16 | environment_variables: 17 | - NCCL_DEBUG=INFO 18 | # You may need to modify this to match your network configuration. 19 | - NCCL_SOCKET_IFNAME=ens,eth,ib 20 | # - CUDA_LAUNCH_BLOCKING=1 21 | # - NCCL_BLOCKING_WAIT=1 22 | image: 23 | gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-2b7e2a1 24 | resources: 25 | slots_per_trial: 2 26 | records_per_epoch: 50000 27 | searcher: 28 | name: single 29 | metric: loss 30 | smaller_is_better: false 31 | max_length: 32 | batches: 1000 33 | entrypoint: 34 | - python3 35 | - -m 36 | - determined.launch.deepspeed 37 | - --trial 38 | - model_def:CIFARTrial 39 | max_restarts: 0 40 | checkpoint_policy: none 41 | -------------------------------------------------------------------------------- /features/torch_batch_process_core_api_comparison/model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Dict, Optional, Sequence, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from determined.experimental import client 8 | 9 | # Constants about the data set. 
10 | IMAGE_SIZE = 32 11 | NUM_CHANNELS = 3 12 | NUM_CLASSES = 10 13 | 14 | TorchData = Union[Dict[str, torch.Tensor], Sequence[torch.Tensor], torch.Tensor] 15 | 16 | 17 | class Flatten(nn.Module): 18 | def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor: 19 | assert len(args) == 1 20 | x = args[0] 21 | assert isinstance(x, torch.Tensor) 22 | return x.contiguous().view(x.size(0), -1) 23 | 24 | 25 | def build_model(): 26 | model = nn.Sequential( 27 | nn.Conv2d(NUM_CHANNELS, IMAGE_SIZE, kernel_size=(3, 3)), 28 | nn.ReLU(), 29 | nn.Conv2d(32, 32, kernel_size=(3, 3)), 30 | nn.ReLU(), 31 | nn.MaxPool2d((2, 2)), 32 | nn.Dropout2d(0.25), 33 | nn.Conv2d(32, 64, (3, 3), padding=1), 34 | nn.ReLU(), 35 | nn.Conv2d(64, 64, (3, 3)), 36 | nn.ReLU(), 37 | nn.MaxPool2d((2, 2)), 38 | nn.Dropout2d(0.25), 39 | Flatten(), 40 | nn.Linear(2304, 512), 41 | nn.ReLU(), 42 | nn.Dropout2d(0.5), 43 | nn.Linear(512, NUM_CLASSES), 44 | ) 45 | return model 46 | -------------------------------------------------------------------------------- /blog/llm-finetuning-2/ds_configs/ds_config_stage_2_cpu_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 2, 29 | "offload_optimizer": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "allgather_partitions": true, 34 | "allgather_bucket_size": 2e8, 35 | "overlap_comm": true, 36 | "reduce_scatter": true, 37 | "reduce_bucket_size": 2e8, 38 | 
"contiguous_gradients": true 39 | }, 40 | "gradient_accumulation_steps": "auto", 41 | "gradient_clipping": "auto", 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "flops_profiler": { 45 | "enabled": true, 46 | "profile_step": 1, 47 | "module_depth": -1, 48 | "top_modules": 1, 49 | "detailed": true, 50 | "output_file": null 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/ds_configs/ds_config_stage_2_cpu_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "optimizer": { 11 | "type": "AdamW", 12 | "params": { 13 | "lr": "auto", 14 | "betas": "auto", 15 | "eps": "auto", 16 | "weight_decay": "auto" 17 | } 18 | }, 19 | "scheduler": { 20 | "type": "WarmupLR", 21 | "params": { 22 | "warmup_min_lr": "auto", 23 | "warmup_max_lr": "auto", 24 | "warmup_num_steps": "auto" 25 | } 26 | }, 27 | "zero_optimization": { 28 | "stage": 2, 29 | "offload_optimizer": { 30 | "device": "cpu", 31 | "pin_memory": true 32 | }, 33 | "allgather_partitions": true, 34 | "allgather_bucket_size": 2e8, 35 | "overlap_comm": true, 36 | "reduce_scatter": true, 37 | "reduce_bucket_size": 2e8, 38 | "contiguous_gradients": true 39 | }, 40 | "gradient_accumulation_steps": "auto", 41 | "gradient_clipping": "auto", 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "flops_profiler": { 45 | "enabled": false, 46 | "profile_step": 1, 47 | "module_depth": -1, 48 | "top_modules": 1, 49 | "detailed": true, 50 | "output_file": null 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /blog/llm-finetuning/README.md: -------------------------------------------------------------------------------- 1 | # LLM 
Finetuning using HuggingFace + Determined 2 | 3 | In this demo, we finetune the [TinyLlama-1.1B-Chat](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4) on a [text-to-SQL dataset](https://huggingface.co/datasets/Clinton/Text-to-sql-v1). We ran this on two 80 GB A100 GPUs. 4 | 5 | To get started, first install Determined on your local machine: 6 | ```bash 7 | pip install determined 8 | ``` 9 | 10 | Then finetune: 11 | ```bash 12 | det e create distributed.yaml . 13 | ``` 14 | 15 | Change configuration options in `distributed.yaml`. Some important options are: 16 | - `slots_per_trial`: the number of GPUs to use. 17 | - `dataset_subset`: the difficulty subset to train on. 18 | - `per_device_train_batch_size`: the batch size per GPU. 19 | 20 | 21 | Test your model's generation capabilities: 22 | 23 | ```bash 24 | python test_model.py --exp_id <exp_id> --dataset_subset <dataset_subset> 25 | ``` 26 | 27 | Where 28 | - `<exp_id>` is the id of your finetuning experiment in the Determined UI. 29 | - `<dataset_subset>` is one of "easy", "medium", or "hard". 30 | 31 | To test the pretrained model (not finetuned), leave out `--exp_id`. For example: 32 | 33 | ```bash 34 | python test_model.py --dataset_subset easy 35 | ``` 36 | 37 | ## Contributors 38 | 39 | - [Kevin Musgrave](https://github.com/KevinMusgrave) 40 | - [Agnieszka Ciborowska](https://github.com/aciborowska) -------------------------------------------------------------------------------- /computer_vision/fasterrcnn_coco_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch Faster R-CNN Example 2 | 3 | This example shows how to build an object detection model on the Penn-Fudan 4 | Database using Determined's PyTorch API. This example is adapted from this [PyTorch 5 | Mask R-CNN tutorial](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
9 | 10 | ### Configuration Files 11 | * **const.yaml**: Train the model with constant hyperparameter values. 12 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 13 | 14 | ## Data 15 | The current implementation uses the pedestrian detection and segmentation 16 | [Penn-Fudan Database](https://www.cis.upenn.edu/~jshi/ped_html/). 17 | 18 | ## To Run 19 | If you have not yet installed Determined, installation instructions can be found 20 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 21 | 22 | Run the following command: `det -m <master host:port> experiment create -f 23 | const.yaml .`. The other configurations can be run by specifying the appropriate 24 | configuration file in place of `const.yaml`. 25 | 26 | ## Results 27 | Training the model with the hyperparameter settings in `const.yaml` should yield 28 | an IOU of ~0.42. 29 | -------------------------------------------------------------------------------- /blog/act-mem-2/mlp_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prints out the ratio of activation memory for the MLP layer when using ReLU vs GELU.
3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | import act_mem 9 | import layers 10 | 11 | if __name__ == "__main__": 12 | batch_size, seq_len, d_model = 2, 4096, 1024 13 | dtype = torch.bfloat16 14 | inputs = torch.randn( 15 | batch_size, 16 | seq_len, 17 | d_model, 18 | device="cuda", 19 | requires_grad=True, 20 | dtype=dtype, 21 | ) 22 | 23 | act_fn_dict = {"ReLU": nn.ReLU(), "GELU": nn.GELU()} 24 | # Append outputs to a list to keep tensors alive 25 | outputs = [] 26 | mem_bytes = [] 27 | 28 | for name, act_fn in act_fn_dict.items(): 29 | mlp = layers.MLP( 30 | d_model=d_model, 31 | act_fn=act_fn, 32 | device="cuda", 33 | dtype=dtype, 34 | ) 35 | with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext( 36 | ignored_tensors=mlp.parameters() 37 | ) as saved: 38 | out = mlp(inputs) 39 | outputs.append(out) 40 | assert mem.delta["current"] == saved.saved_tensor_mem 41 | print(f"{name} bytes: {saved.saved_tensor_mem}") 42 | mem_bytes.append(saved.saved_tensor_mem) 43 | 44 | print(f"ReLU/GELU act mem ratio: {mem_bytes[0]/mem_bytes[1]}") 45 | -------------------------------------------------------------------------------- /blog/act-mem-2/block_script.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prints out the ratio of activation memory for a transformer Block when using ReLU vs GELU.
3 | """ 4 | 5 | import torch 6 | import torch.nn as nn 7 | 8 | import act_mem 9 | import layers 10 | 11 | if __name__ == "__main__": 12 | batch_size, seq_len, d_model, n_heads = 2, 4096, 1024, 2 13 | dtype = torch.bfloat16 14 | inputs = torch.randn( 15 | batch_size, 16 | seq_len, 17 | d_model, 18 | device="cuda", 19 | requires_grad=True, 20 | dtype=dtype, 21 | ) 22 | 23 | act_fn_dict = {"ReLU": nn.ReLU(), "GELU": nn.GELU()} 24 | # Append outputs to a list to keep tensors alive 25 | outputs = [] 26 | mem_bytes = [] 27 | 28 | for name, act_fn in act_fn_dict.items(): 29 | block = layers.Block( 30 | d_model=d_model, 31 | act_fn=act_fn, 32 | n_heads=n_heads, 33 | device="cuda", 34 | dtype=dtype, 35 | ) 36 | with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext( 37 | ignored_tensors=block.parameters() 38 | ) as saved: 39 | out = block(inputs) 40 | outputs.append(out) 41 | print(f"{name} block bytes: {saved.saved_tensor_mem}") 42 | mem_bytes.append(saved.saved_tensor_mem) 43 | 44 | print(f"ReLU/GeLU block act mem ratio: {mem_bytes[0]/mem_bytes[1]}") 45 | -------------------------------------------------------------------------------- /features/hp_constraints_mnist_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch HP Search Constraints (MNIST) 2 | This tutorial shows how to use Determined's HP Search Constraints with 3 | PyTorch. In this example, the constraints are defined in Lines 56-57 of 4 | the `__init__` function in `model_def.py` based on the model hyperparameters 5 | via the `det.InvalidHP` exception API (see the `HP Search Constraints` topic 6 | guide under https://docs.determined.ai/latest/topic-guides/index.html). 7 | 8 | Constraints can also be defined in `train_batch` and `evaluate_batch`, 9 | where an InvalidHP exception can be raised based on 10 | training and validation metrics respectively.
11 | 12 | This example is based on Determined's `mnist_pytorch` tutorial, with the 13 | addition of the HP search constraint as the only modification. 14 | 15 | ## Files 16 | * **model_def.py**: Where the HP Search constraint is defined and used. 17 | * All other files are identical to the `mnist_pytorch` tutorial code. 18 | 19 | ## To Run 20 | If you have not yet installed Determined, installation instructions can be found 21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 22 | 23 | Run the following command: `det -m <master host:port> experiment create -f 24 | adaptive.yaml .`. 25 | 26 | ## Results 27 | Training the model with the hyperparameter settings in `adaptive.yaml` should yield 28 | a validation accuracy of ~97%. 29 | -------------------------------------------------------------------------------- /model_hub/huggingface/multiple-choice/swag_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_swag_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: roberta-base 4 | model_mode: multiple-choice 5 | use_pretrained_weights: true 6 | use_apex_amp: true 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 64 10 | learning_rate: 5.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: swag 17 | dataset_config_name: regular 18 | train_file: null 19 | validation_file: null 20 | overwrite_cache: false 21 | preprocessing_num_workers: null 22 | max_seq_length: 128 23 | pad_to_max_length: false 24 | # Number of records per epoch differs based on max_seq_length.
25 | records_per_epoch: 73546 26 | min_validation_period: 27 | batches: 500 28 | searcher: 29 | name: single 30 | metric: accuracy 31 | max_length: 32 | epochs: 3 33 | smaller_is_better: false 34 | environment: 35 | image: 36 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 37 | resources: 38 | slots_per_trial: 2 39 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 40 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 41 | bind_mounts: 42 | - host_path: /tmp 43 | container_path: /root/.cache 44 | entrypoint: swag_trial:SWAGTrial 45 | -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/clm_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_clm_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: gpt2 4 | model_mode: causal-lm 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 8 10 | learning_rate: 5.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: wikitext 17 | dataset_config_name: wikitext-2-raw-v1 18 | train_file: null 19 | validation_file: null 20 | max_seq_length: null 21 | overwrite_cache: false 22 | validation_split_percentage: 5 23 | preprocessing_num_workers: null 24 | # Number of records per epoch differs based on max_seq_length. 
25 | records_per_epoch: 2318 26 | min_validation_period: 27 | batches: 500 28 | searcher: 29 | name: single 30 | metric: perplexity 31 | max_length: 32 | epochs: 3 33 | smaller_is_better: true 34 | environment: 35 | image: 36 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 37 | resources: 38 | slots_per_trial: 4 39 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 40 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 41 | bind_mounts: 42 | - host_path: /tmp 43 | container_path: /root/.cache 44 | entrypoint: clm_trial:CLMTrial 45 | -------------------------------------------------------------------------------- /blog/lora-parameters/ds_configs/ds_config_stage_3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupDecayLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto", 28 | "total_num_steps": "auto" 29 | } 30 | }, 31 | "zero_optimization": { 32 | "stage": 3, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "sub_group_size": 1e9, 36 | "reduce_bucket_size": "auto", 37 | "stage3_prefetch_bucket_size": "auto", 38 | "stage3_param_persistence_threshold": "auto", 39 | "stage3_max_live_parameters": 1e9, 40 | "stage3_max_reuse_distance": 1e9, 41 | "stage3_gather_16bit_weights_on_model_save": true 42 | }, 43 | "gradient_accumulation_steps": "auto", 44 | "gradient_clipping": "auto", 45 | "train_batch_size": "auto", 46 | 
"train_micro_batch_size_per_gpu": "auto" 47 | } -------------------------------------------------------------------------------- /gan/gan_mnist_pytorch/data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import urllib.parse 5 | from typing import Any, Dict 6 | 7 | import requests 8 | from torchvision import datasets, transforms 9 | 10 | 11 | def get_dataset(data_dir: str, train: bool) -> Any: 12 | return datasets.MNIST( 13 | data_dir, 14 | train=train, 15 | transform=transforms.Compose( 16 | [ 17 | transforms.ToTensor(), 18 | transforms.Normalize((0.5,), (0.5,)), 19 | ] 20 | ), 21 | ) 22 | 23 | 24 | def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str: 25 | url = data_config["url"] 26 | url_path = urllib.parse.urlparse(url).path 27 | basename = url_path.rsplit("/", 1)[1] 28 | 29 | download_directory = os.path.join(download_directory, "MNIST") 30 | os.makedirs(download_directory, exist_ok=True) 31 | filepath = os.path.join(download_directory, basename) 32 | if not os.path.exists(filepath): 33 | logging.info("Downloading {} to {}".format(url, filepath)) 34 | 35 | r = requests.get(url, stream=True) 36 | with open(filepath, "wb") as f: 37 | for chunk in r.iter_content(chunk_size=8192): 38 | if chunk: 39 | f.write(chunk) 40 | 41 | shutil.unpack_archive(filepath, download_directory) 42 | 43 | return os.path.dirname(download_directory) 44 | -------------------------------------------------------------------------------- /computer_vision/deformabledetr_coco_pytorch/startup-hook.sh: -------------------------------------------------------------------------------- 1 | apt-get update 2 | apt-get install unzip 3 | 4 | # Download COCO 2017 annotations 5 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 6 | unzip -o annotations_trainval2017.zip 7 | mv annotations/instances_train2017.json /tmp 8 | mv 
annotations/instances_val2017.json /tmp 9 | 10 | # Clone Deformable-DETR library from source. 11 | # Since it is not an installable pacakge, we will have to add this to system path to import functions from it. 12 | git clone https://github.com/fundamentalvision/Deformable-DETR ddetr 13 | cd ddetr && git reset --hard 11169a60c33333af00a4849f1808023eba96a931 14 | # Need to fix a bug in the original code that fails to handle torchvision version 0.10 correctly. 15 | # Deformable DETR has some changes from DETR that need additional handling. 16 | sed -i 's/float(torchvision\.__version__\[:3\]) < 0.5/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py 17 | sed -i 's/float(torchvision\.__version__\[:3\]) < 0.7/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py 18 | 19 | pip install tqdm attrdict pycocotools cython scipy 20 | 21 | # Build custom cuda ops 22 | cd models/ops 23 | sh ./make.sh 24 | cd ../../.. 25 | 26 | # Download pretrained model using link from https://github.com/fundamentalvision/Deformable-DETR 27 | pip install gdown 28 | gdown https://drive.google.com/uc?id=1nDWZWHuRwtwGden77NLM9JoWe-YisJnA -O model.ckpt 29 | -------------------------------------------------------------------------------- /model_hub/huggingface/token-classification/ner_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_ner_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: bert-base-uncased 4 | model_mode: token-classification 5 | finetuning_task: ner 6 | use_pretrained_weights: true 7 | use_apex_amp: false 8 | # Training Args 9 | global_batch_size: 8 10 | learning_rate: 5.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: conllpp 17 | dataset_config_name: null 18 | train_file: null 19 | validation_file: null 20 | preprocessing_num_workers: null 21 | cache_dir: null 22 | overwrite_cache: false 23 
| pad_to_max_length: false 24 | label_all_tokens: false 25 | # Number of records per epoch differs based on max_seq_length. 26 | records_per_epoch: 14041 27 | min_validation_period: 28 | batches: 500 29 | searcher: 30 | name: single 31 | metric: accuracy_score 32 | max_length: 33 | epochs: 3 34 | smaller_is_better: false 35 | environment: 36 | image: 37 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 38 | resources: 39 | slots_per_trial: 1 40 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 41 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 42 | bind_mounts: 43 | - host_path: /tmp 44 | container_path: /root/.cache 45 | entrypoint: ner_trial:NERTrial 46 | -------------------------------------------------------------------------------- /model_hub/huggingface/text-classification/xnli_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_xnli_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: bert-base-multilingual-cased 4 | model_mode: sequence-classification 5 | finetuning_task: xnli 6 | use_apex_amp: false 7 | use_pretrained_weights: true 8 | do_lower_case: false 9 | # Training Args 10 | global_batch_size: 32 11 | learning_rate: 5.0e-5 12 | adam_epsilon: 1.0e-8 13 | weight_decay: 0 14 | lr_scheduler_type: linear 15 | num_warmup_steps: 0 16 | data: 17 | dataset_name: xnli 18 | language: de 19 | train_language: en 20 | max_seq_length: 128 21 | train_file: null 22 | validation_file: null 23 | preprocessing_num_workers: null 24 | cache_dir: null 25 | overwrite_cache: false 26 | pad_to_max_length: true 27 | # Number of records per epoch differs based on max_seq_length. 
28 | records_per_epoch: 392702 29 | min_validation_period: 30 | batches: 500 31 | searcher: 32 | name: single 33 | metric: accuracy 34 | max_length: 35 | epochs: 2 36 | smaller_is_better: false 37 | environment: 38 | image: 39 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 40 | resources: 41 | slots_per_trial: 2 42 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 43 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 44 | bind_mounts: 45 | - host_path: /tmp 46 | container_path: /root/.cache 47 | entrypoint: xnli_trial:XNLITrial 48 | -------------------------------------------------------------------------------- /computer_vision/cifar10_tf_keras/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow (tf.keras) CIFAR-10 CNN Example 2 | 3 | This example shows how to build a simple CNN on the CIFAR-10 dataset using 4 | Determined's tf.keras API. This example is adapted from this [Keras CNN 5 | example](https://github.com/keras-team/keras/blob/keras-2/examples/cifar10_cnn.py). 6 | 7 | ## Files 8 | * **model_def.py**: Organizes the model and data-loaders into the Determined TFKerasTrial API. 9 | * **cifar_model.py**: The core code for the model. This includes building and compiling the model. 10 | 11 | ### Configuration Files 12 | * **const.yaml**: Train the model with constant hyperparameter values. 13 | * **distributed.yaml**: Same as `const.yaml`, but instead uses multiple GPUs. 14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 15 | 16 | ## Data: 17 | The current implementation uses CIFAR-10 data downloaded from AWS S3. 
18 | 19 | ## To Run: 20 | If you have not yet installed Determined, installation instructions can be found 21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 22 | 23 | Run the following command: `det -m experiment create -f 24 | const.yaml .`. The other configurations can be run by specifying the appropriate 25 | configuration file in place of `const.yaml`. 26 | 27 | ## Results: 28 | Training the model with the hyperparameter settings in `const.yaml` should yield 29 | a validation accuracy of ~74%. 30 | -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/mlm_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_mlm_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: roberta-base 4 | model_mode: masked-lm 5 | use_pretrained_weights: true 6 | use_apex_amp: true 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 8 10 | learning_rate: 5.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: wikitext 17 | dataset_config_name: wikitext-2-raw-v1 18 | train_file: null 19 | validation_file: null 20 | overwrite_cache: false 21 | validation_split_percentage: 5 22 | max_seq_length: null 23 | preprocessing_num_workers: null 24 | mlm_probability: 0.15 25 | line_by_line: false 26 | pad_to_max_length: false 27 | # Number of records per epoch differs based on max_seq_length. 
28 | records_per_epoch: 4798 29 | min_validation_period: 30 | batches: 500 31 | searcher: 32 | name: single 33 | metric: perplexity 34 | max_length: 35 | epochs: 3 36 | smaller_is_better: true 37 | environment: 38 | image: 39 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 40 | resources: 41 | slots_per_trial: 1 42 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 43 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 44 | bind_mounts: 45 | - host_path: /tmp 46 | container_path: /root/.cache 47 | entrypoint: mlm_trial:MLMTrial 48 | -------------------------------------------------------------------------------- /gan/cyclegan/datasets.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import random 3 | import os 4 | 5 | from torch.utils.data import Dataset 6 | from PIL import Image 7 | import torchvision.transforms as transforms 8 | 9 | 10 | def to_rgb(image): 11 | rgb_image = Image.new("RGB", image.size) 12 | rgb_image.paste(image) 13 | return rgb_image 14 | 15 | 16 | class ImageDataset(Dataset): 17 | def __init__(self, root, transforms_=None, unaligned=False, mode="train"): 18 | self.transform = transforms.Compose(transforms_) 19 | self.unaligned = unaligned 20 | 21 | self.files_A = sorted(glob.glob(os.path.join(root, "%s/A" % mode) + "/*.*")) 22 | self.files_B = sorted(glob.glob(os.path.join(root, "%s/B" % mode) + "/*.*")) 23 | 24 | def __getitem__(self, index): 25 | image_A = Image.open(self.files_A[index % len(self.files_A)]) 26 | 27 | if self.unaligned: 28 | image_B = Image.open(self.files_B[random.randint(0, len(self.files_B) - 1)]) 29 | else: 30 | image_B = Image.open(self.files_B[index % len(self.files_B)]) 31 | 32 | # Convert grayscale images to rgb 33 | if image_A.mode != "RGB": 34 | image_A = to_rgb(image_A) 35 | if image_B.mode != "RGB": 36 | image_B = to_rgb(image_B) 37 | 38 | item_A = 
class DotDict(dict):
    """Dictionary whose items can also be read, set and deleted as attributes.

    Built from an existing mapping; any value that is the literal string
    ``"None"`` is stored as ``None``. Reading an attribute that is not a key
    yields ``None`` instead of raising.
    """

    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __init__(self, dct):
        """Copy the items of ``dct``, mapping the string ``"None"`` to ``None``."""
        for key, value in dct.items():
            # Compare only real strings: objects with a custom ``__eq__``
            # may not return a plain bool.
            if isinstance(value, str) and value == "None":
                value = None
            self[key] = value

    def __getattr__(self, name):
        """Return ``self[name]``, or ``None`` when ``name`` is not a key."""
        # Dunder lookups are protocol probes (copy, pickle, hasattr, ...).
        # Answering them with None makes the object claim hooks it does not
        # implement, so report them as missing the normal way.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(name)
        return self.get(name)
"build_resnet_fpn_backbone" 5 | RESNETS: 6 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 7 | FPN: 8 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 9 | ANCHOR_GENERATOR: 10 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 11 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 12 | RPN: 13 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 14 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 15 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 16 | # Detectron1 uses 2000 proposals per-batch, 17 | # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) 18 | # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 19 | POST_NMS_TOPK_TRAIN: 1000 20 | POST_NMS_TOPK_TEST: 1000 21 | ROI_HEADS: 22 | NAME: "StandardROIHeads" 23 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 24 | ROI_BOX_HEAD: 25 | NAME: "FastRCNNConvFCHead" 26 | NUM_FC: 2 27 | POOLER_RESOLUTION: 7 28 | ROI_MASK_HEAD: 29 | NAME: "MaskRCNNConvUpsampleHead" 30 | NUM_CONV: 4 31 | POOLER_RESOLUTION: 14 32 | DATASETS: 33 | TRAIN: ("coco_2017_train",) 34 | TEST: ("coco_2017_val",) 35 | SOLVER: 36 | IMS_PER_BATCH: 16 37 | BASE_LR: 0.02 38 | STEPS: (60000, 80000) 39 | MAX_ITER: 90000 40 | INPUT: 41 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 42 | VERSION: 2 -------------------------------------------------------------------------------- /nas/gaea_pytorch/search/data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class BilevelDataset(Dataset): 6 | def __init__( 7 | self, 8 | dataset, 9 | ): 10 | """ 11 | We will split the data into a train split and a validation split 12 | and return one image from each split as a single observation. 
13 | 14 | Args: 15 | dataset: PyTorch Dataset object 16 | """ 17 | inds = np.arange(len(dataset)) 18 | self.dataset = dataset 19 | # Make sure train and val splits are of equal size. 20 | # This is so we make sure to loop images in both train 21 | # and val splits exactly once in an epoch. 22 | n_train = int(0.5 * len(inds)) 23 | self.train_inds = inds[0:n_train] 24 | self.val_inds = inds[n_train : 2 * n_train] 25 | assert len(self.train_inds) == len(self.val_inds) 26 | 27 | def shuffle_val_inds(self): 28 | # This is so we will see different pairs of images 29 | # from train and val splits. Will need to call this 30 | # manually at epoch end. 31 | np.random.shuffle(self.val_inds) 32 | 33 | def __len__(self): 34 | return len(self.train_inds) 35 | 36 | def __getitem__(self, idx): 37 | train_ind = self.train_inds[idx] 38 | val_ind = self.val_inds[idx] 39 | x_train, y_train = self.dataset[train_ind] 40 | x_val, y_val = self.dataset[val_ind] 41 | return x_train, y_train, x_val, y_val 42 | -------------------------------------------------------------------------------- /model_hub/huggingface/language-modeling/plm_config.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_plm_trial 2 | hyperparameters: 3 | pretrained_model_name_or_path: xlnet-base-cased 4 | model_mode: causal-lm 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 2 10 | learning_rate: 2.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: wikitext 17 | dataset_config_name: wikitext-2-raw-v1 18 | train_file: null 19 | validation_file: null 20 | overwrite_cache: false 21 | validation_split_percentage: 5 22 | max_seq_length: 512 23 | preprocessing_num_workers: null 24 | plm_probability: 0.15 25 | max_span_length: 5 26 | line_by_line: false 27 | pad_to_max_length: false 28 | # Number of records per 
epoch differs based on max_seq_length. 29 | records_per_epoch: 5334 30 | min_validation_period: 31 | batches: 500 32 | searcher: 33 | name: single 34 | metric: perplexity 35 | max_length: 36 | epochs: 3 37 | smaller_is_better: true 38 | environment: 39 | image: 40 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 41 | resources: 42 | slots_per_trial: 2 43 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 44 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 45 | bind_mounts: 46 | - host_path: /tmp 47 | container_path: /root/.cache 48 | entrypoint: plm_trial:PLMTrial 49 | -------------------------------------------------------------------------------- /hp_search_benchmarks/darts_penntreebank_pytorch/const.yaml: -------------------------------------------------------------------------------- 1 | name: darts_rnn_nas 2 | 3 | data: 4 | data_download_dir: /data 5 | 6 | bind_mounts: 7 | - host_path: /tmp 8 | container_path: /data 9 | read_only: false 10 | 11 | hyperparameters: 12 | learning_rate: 20 13 | global_batch_size: 64 14 | # Epoch to start checking whether we should switch to 15 | # ASGD instead of SGD. 
16 | optimizer_switch_epoch: 75 17 | eval_batch_size: 10 18 | emsize: 850 19 | nhid: 850 20 | nhidlast: 850 21 | bptt: 35 22 | dropout: 0.75 23 | dropouth: 0.25 24 | dropoutx: 0.75 25 | dropouti: 0.2 26 | dropoute: 0.1 27 | nonmono: 5 28 | alpha: 0 29 | beta: 1.0e-3 30 | weight_decay: 8.0e-7 31 | max_seq_length_delta: 20 32 | clip_gradients_l2_norm: 0.25 33 | 34 | 35 | # Tunable hyperparameters 36 | node1_edge: 0 37 | node2_edge: 1 38 | node3_edge: 1 39 | node4_edge: 1 40 | node5_edge: 2 41 | node6_edge: 5 42 | node7_edge: 3 43 | node8_edge: 5 44 | 45 | node1_op: sigmoid 46 | node2_op: relu 47 | node3_op: relu 48 | node4_op: identity 49 | node5_op: tanh 50 | node6_op: sigmoid 51 | node7_op: tanh 52 | node8_op: relu 53 | 54 | resources: 55 | slots_per_trial: 2 56 | 57 | scheduling_unit: 100 58 | 59 | min_validation_period: 60 | batches: 400 61 | 62 | optimizations: 63 | average_training_metrics: true 64 | 65 | searcher: 66 | name: single 67 | metric: loss 68 | max_length: 69 | batches: 10000 70 | smaller_is_better: true 71 | 72 | entrypoint: model_def:DARTSRNNTrial 73 | -------------------------------------------------------------------------------- /computer_vision/cifar10_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch CIFAR-10 CNN Example 2 | 3 | This example shows how to build a simple CNN on the CIFAR-10 dataset using 4 | Determined's PyTorch API. This example is adapted from this [Keras CNN 5 | example](https://github.com/keras-team/keras/blob/keras-2/examples/cifar10_cnn.py). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 9 | 10 | ### Configuration Files 11 | * **const.yaml**: Train the model with constant hyperparameter values. 12 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 
13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training). 14 | * **distributed_inference.yaml**: Use the distributed training workflow with PyTorchTrial to accelerate batch inference workloads. 15 | 16 | ## Data 17 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html. 18 | 19 | ## To Run 20 | If you have not yet installed Determined, installation instructions can be found 21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 22 | 23 | Run the following command: `det -m experiment create -f 24 | const.yaml .`. The other configurations can be run by specifying the appropriate 25 | configuration file in place of `const.yaml`. 26 | 27 | ## Results 28 | Training the model with the hyperparameter settings in `const.yaml` should yield a validation accuracy of ~74%. 29 | -------------------------------------------------------------------------------- /deepspeed/cifar10_cpu_offloading/ds_config_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "train_batch_size": 128, 3 | "steps_per_print": 10, 4 | "optimizer": { 5 | "type": "Adam", 6 | "params": { 7 | "lr": 0.001, 8 | "betas": [ 9 | 0.8, 10 | 0.999 11 | ], 12 | "eps": 1e-8, 13 | "weight_decay": 3e-7 14 | } 15 | }, 16 | "scheduler": { 17 | "type": "WarmupLR", 18 | "params": { 19 | "warmup_min_lr": 0, 20 | "warmup_max_lr": 0.001, 21 | "warmup_num_steps": 1000 22 | } 23 | }, 24 | "zero_optimization": { 25 | "stage": 3, 26 | "offload_optimizer": { 27 | "device": "cpu", 28 | "pin_memory": true, 29 | "buffer_count": 4, 30 | "fast_init": false 31 | }, 32 | "offload_param": { 33 | "device": "cpu", 34 | "pin_memory": true, 35 | "buffer_count": 5, 36 | "buffer_size": 1e8, 37 | "max_in_cpu": 1e9 38 | }, 39 | "allgather_partitions": true, 40 | "allgather_bucket_size": 5e8, 41 | "overlap_comm": true, 42 | "reduce_scatter": true, 43 | "reduce_bucket_size": 5e8, 
44 | "contiguous_gradients": true, 45 | "stage3_max_live_parameters": 1e9, 46 | "stage3_max_reuse_distance": 1e9, 47 | "stage3_prefetch_bucket_size": 5e8, 48 | "stage3_param_persistence_threshold": 1e6 49 | }, 50 | "gradient_clipping": 1.0, 51 | "fp16": { 52 | "enabled": true, 53 | "loss_scale": 0, 54 | "initial_scale_power": 5, 55 | "loss_scale_window": 1000, 56 | "hysteresis": 2, 57 | "min_loss_scale": 1 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /tutorials/fashion_mnist_tf_keras/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow (tf.keras) Fashion MNIST Tutorial 2 | 3 | This tutorial shows how to build a simple CNN on the MNIST dataset using 4 | Determined's tf.keras API. This example is adapted from this [Keras image 5 | classification tutorial](https://www.tensorflow.org/tutorials/keras/classification). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 9 | * **data.py**: The data loading and preparation code for the model. 10 | 11 | ### Configuration Files 12 | * **const.yaml**: Train the model with constant hyperparameter values. 13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training). 14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 15 | 16 | ## Data 17 | The current implementation downloads the Fashion MNIST data from 18 | [here](https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE). 19 | 20 | ## To Run 21 | If you have not yet installed Determined, installation instructions can be found 22 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 23 | 24 | Run the following command: `det -m experiment create -f 25 | const.yaml .`. 
def list_blobs(storage_client: Any, bucket_name: str, prefix: str = None) -> List:
    """Lists all the blobs in the bucket.

    Thin wrapper that forwards to ``storage_client.list_blobs`` and returns
    its result unchanged. Adapted from the GCP code samples at
    https://cloud.google.com/storage/docs/listing-objects#code-samples
    """
    return storage_client.list_blobs(bucket_name, prefix=prefix)
# A genotype is a list of (operation name, index) pairs plus the indices
# listed under ``concat``.
Genotype = namedtuple("Genotype", ["recurrent", "concat"])

PRIMITIVES = ["none", "tanh", "relu", "sigmoid", "identity"]
STEPS = 8
CONCAT = 8

# Each ``recurrent`` list is written as two parallel rows (operations, then
# the index paired with each operation) and zipped into pairs.
ENAS = Genotype(
    recurrent=list(
        zip(
            ["tanh", "tanh", "relu", "tanh", "tanh", "relu", "relu", "relu", "relu", "relu", "relu"],
            [0, 1, 1, 3, 3, 3, 4, 7, 8, 8, 8],
        )
    ),
    concat=[2, 5, 6, 9, 10, 11],
)

DARTS_V1 = Genotype(
    recurrent=list(
        zip(
            ["relu", "relu", "tanh", "relu", "relu", "identity", "relu", "relu"],
            [0, 1, 2, 3, 4, 1, 5, 1],
        )
    ),
    concat=range(1, 9),
)
DARTS_V2 = Genotype(
    recurrent=list(
        zip(
            ["sigmoid", "relu", "relu", "identity", "tanh", "sigmoid", "tanh", "relu"],
            [0, 1, 1, 1, 2, 5, 3, 5],
        )
    ),
    concat=range(1, 9),
)

# Default genotype.
DARTS = DARTS_V2

ASHA = Genotype(
    recurrent=list(
        zip(
            ["relu", "relu", "sigmoid", "tanh", "relu", "tanh", "identity", "sigmoid"],
            [0, 0, 0, 0, 1, 0, 5, 0],
        )
    ),
    concat=range(1, 9),
)
46 | bind_mounts: 47 | - host_path: /tmp 48 | container_path: /root/.cache 49 | - host_path: /tmp 50 | container_path: /tmp 51 | entrypoint: qa_trial:QATrial 52 | -------------------------------------------------------------------------------- /features/checkpoint_hooks_pytorch/README.md: -------------------------------------------------------------------------------- 1 | # PyTorch MNIST CNN Tutorial 2 | This tutorial shows how to build a simple CNN on the MNIST dataset using 3 | Determined's PyTorch API and showcases PyTorchTrial checkpoint callbacks. 4 | This example is adapted from this [PyTorch MNIST 5 | tutorial](https://github.com/pytorch/examples/tree/master/mnist). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 9 | * **data.py**: The data loading and preparation code for the model. 10 | * **layers.py**: Defines the convolutional layers that the model uses. 11 | 12 | ### Configuration Files 13 | * **const.yaml**: Train the model with constant hyperparameter values. 14 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training). 15 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 16 | 17 | ## Data 18 | The current implementation uses MNIST data downloaded from AWS S3. 19 | 20 | ## To Run 21 | If you have not yet installed Determined, installation instructions can be found 22 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 23 | 24 | Run the following command: `det -m experiment create -f 25 | const.yaml .`. The other configurations can be run by specifying the appropriate 26 | configuration file in place of `const.yaml`. 27 | 28 | ## Results 29 | Training the model with the hyperparameter settings in `const.yaml` should yield 30 | a validation accuracy of ~97%. 
31 | -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/squad_v2.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_squad_v2 2 | hyperparameters: 3 | pretrained_model_name_or_path: bert-base-uncased 4 | model_mode: question-answering 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 12 10 | learning_rate: 3.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: squad_v2 17 | train_file: null 18 | validation_file: null 19 | overwrite_cache: false 20 | preprocessing_num_workers: null 21 | max_seq_length: 384 22 | pad_to_max_length: true 23 | version_2_with_negative: true 24 | null_score_diff_threshold: 0 25 | doc_stride: 128 26 | n_best_size: 20 27 | max_answer_length: 30 28 | output_dir: /tmp 29 | # Number of records per epoch differs based on max_seq_length. 30 | records_per_epoch: 131754 31 | min_validation_period: 32 | batches: 5000 33 | searcher: 34 | name: single 35 | metric: f1 36 | max_length: 37 | epochs: 4 38 | smaller_is_better: false 39 | environment: 40 | image: 41 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 42 | resources: 43 | slots_per_trial: 1 44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 
46 | bind_mounts: 47 | - host_path: /tmp 48 | container_path: /root/.cache 49 | - host_path: /tmp 50 | container_path: /tmp 51 | entrypoint: qa_trial:QATrial 52 | -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/squad_beam_search.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_squad_with_beam_search 2 | hyperparameters: 3 | pretrained_model_name_or_path: xlnet-large-cased 4 | model_mode: question-answering 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 4 10 | learning_rate: 3.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: squad 17 | train_file: null 18 | validation_file: null 19 | overwrite_cache: false 20 | preprocessing_num_workers: null 21 | max_seq_length: 384 22 | pad_to_max_length: true 23 | version_2_with_negative: false 24 | doc_stride: 128 25 | n_best_size: 20 26 | max_answer_length: 30 27 | output_dir: /tmp 28 | # Number of records per epoch differs based on max_seq_length. 29 | records_per_epoch: 88835 30 | min_validation_period: 31 | batches: 5000 32 | searcher: 33 | name: single 34 | metric: f1 35 | max_length: 36 | epochs: 2 37 | smaller_is_better: false 38 | environment: 39 | image: 40 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 41 | resources: 42 | slots_per_trial: 1 43 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 44 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 
45 | bind_mounts: 46 | - host_path: /tmp 47 | container_path: /root/.cache 48 | - host_path: /tmp 49 | container_path: /tmp 50 | entrypoint: qa_beam_search_trial:QABeamSearchTrial 51 | -------------------------------------------------------------------------------- /features/torch_batch_process_core_api_comparison/README.md: -------------------------------------------------------------------------------- 1 | # Batch inference with Core API & Torch Batch Processing API 2 | 3 | ## Overview 4 | 5 | This example illustrates how to run distributed batch inference with Core API. Determined's Core API is very flexible 6 | and can be used to run almost anything, including batch inference. 7 | 8 | With Core API, we are able to write an example that 9 | - is distributed across worker 10 | - can be preempted and resumed 11 | - can be monitored on the Determined UI 12 | 13 | However, using Core API directly would require the user to directly handle 14 | - low-level parallel programming concepts such as gather, rank 15 | - Determined machinery such as creating and loading checkpoint, preemption and resumption 16 | - initialization of appropriate distributed context 17 | - proper sharding of dataset 18 | 19 | You will see that using the Torch Batch Processing API for the same task is a lot easier as it abstracted away all the 20 | low level details and provides useful helper functions. 21 | 22 | ## Detailed on this example 23 | 24 | We are running inference with a simple vision model on the CIFAR10 dataset. We then store the prediction outcome to the 25 | file system in the Core API example and to the same storage system used by Determined checkpoints in the 26 | `torch_batch_process` example. You can access the output through the underlying storage (e.g. s3 bucket, shared_fs). 
27 | 28 | To run the Core API example, simply run `det e create core_api_config.yaml .` 29 | To run the Torch Batch Processing example, simply run `det e create torch_batch_process_config.yaml .` 30 | -------------------------------------------------------------------------------- /gan/cyclegan/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import datetime 4 | import sys 5 | 6 | from torch.autograd import Variable 7 | import torch 8 | import numpy as np 9 | 10 | from torchvision.utils import save_image 11 | 12 | 13 | class ReplayBuffer: 14 | def __init__(self, max_size=50): 15 | assert max_size > 0, "Empty buffer or trying to create a black hole. Be careful." 16 | self.max_size = max_size 17 | self.data = [] 18 | 19 | def push_and_pop(self, data): 20 | to_return = [] 21 | for element in data.data: 22 | element = torch.unsqueeze(element, 0) 23 | if len(self.data) < self.max_size: 24 | self.data.append(element) 25 | to_return.append(element) 26 | else: 27 | if random.uniform(0, 1) > 0.5: 28 | i = random.randint(0, self.max_size - 1) 29 | to_return.append(self.data[i].clone()) 30 | self.data[i] = element 31 | else: 32 | to_return.append(element) 33 | return Variable(torch.cat(to_return)) 34 | 35 | 36 | class LambdaLR: 37 | def __init__(self, n_epochs, offset, decay_start_epoch): 38 | assert (n_epochs - decay_start_epoch) > 0, "Decay must start before the training session ends!" 
39 | self.n_epochs = n_epochs 40 | self.offset = offset 41 | self.decay_start_epoch = decay_start_epoch 42 | 43 | def step(self, epoch): 44 | return 1.0 - max(0, epoch + self.offset - self.decay_start_epoch) / (self.n_epochs - self.decay_start_epoch) 45 | -------------------------------------------------------------------------------- /gan/dcgan_tf_keras/README.md: -------------------------------------------------------------------------------- 1 | # DCGAN TensorFlow Keras GAN Example 2 | 3 | This example demonstrates how to build a simple GAN on the MNIST dataset using Determined's TensorFlow Keras API. This example is adapted from this [TensorFlow Tutorial](https://www.tensorflow.org/tutorials/generative/dcgan). 4 | The DCGAN Keras model featured in this example subclasses `tf.keras.Model` and defines a custom `train_step()` and `test_step()`. This functionality was first added in TensorFlow 2.2. 5 | 6 | ## Files 7 | * **dc_gan.py**: The code code defining the model. 8 | * **data.py**: The data loading and preparation code for the model. 9 | * **model_def.py**: Organizes the model into Determined's TensorFlow Keras API. 10 | * **export.py**: Exports a trained checkpoint and uses it to generate images. 11 | 12 | 13 | ### Configuration Files 14 | * **const.yaml**: Train the model with constant hyperparameter values. 15 | * **distributed.yaml**: Same as const.yaml, but instead uses multiple GPUs (distributed training). 16 | 17 | ## To Run 18 | Installation instructions can be found under `docs/install-admin.html` or at [Determined installation page](https://docs.determined.ai/latest/index.html). 19 | After configuring the settings in `const.yaml`, run the following command: `det -m experiment create -f const.yaml . 
` 20 | 21 | ## To Export 22 | Once the model has been trained, its top checkpoint can be exported and used to generate images by running: 23 | ```bash 24 | python export.py --experiment-id --master-url 25 | ``` 26 | 27 | ![Generate Images](./images/dcgan_inference_example.png) 28 | -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/squad_distributed.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_squad_distributed 2 | hyperparameters: 3 | pretrained_model_name_or_path: bert-large-uncased-whole-word-masking 4 | model_mode: question-answering 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 24 10 | learning_rate: 3e-5 11 | adam_epsilon: 1e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: squad 17 | train_file: null 18 | validation_file: null 19 | overwrite_cache: false 20 | preprocessing_num_workers: null 21 | max_seq_length: 384 22 | pad_to_max_length: true 23 | version_2_with_negative: false 24 | null_score_diff_threshold: 0 25 | doc_stride: 128 26 | n_best_size: 20 27 | max_answer_length: 30 28 | output_dir: /tmp 29 | # Number of records per epoch differs based on max_seq_length. 30 | records_per_epoch: 88524 31 | min_validation_period: 32 | batches: 5000 33 | searcher: 34 | name: single 35 | metric: f1 36 | max_length: 37 | epochs: 2 38 | smaller_is_better: false 39 | environment: 40 | image: 41 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 42 | resources: 43 | slots_per_trial: 8 44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 
46 | bind_mounts: 47 | - host_path: /tmp 48 | container_path: /root/.cache 49 | - host_path: /tmp 50 | container_path: /tmp 51 | entrypoint: qa_trial:QATrial 52 | -------------------------------------------------------------------------------- /blog/lora-parameters/lora.yaml: -------------------------------------------------------------------------------- 1 | name: mistral lora hard 2 | debug: false 3 | environment: 4 | environment_variables: 5 | - NCCL_DEBUG=INFO 6 | - NCCL_SOCKET_IFNAME=ens,eth,ib 7 | image: 8 | gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14 9 | cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7 10 | resources: 11 | slots_per_trial: 2 12 | resource_pool: # We used A100 40GB GPUs 13 | workspace: 14 | project: 15 | searcher: 16 | name: grid 17 | max_length: 18 | batches: 3000 19 | metric: eval_accuracy 20 | smaller_is_better: false 21 | hyperparameters: 22 | model: "mistralai/Mistral-7B-Instruct-v0.2" 23 | model_commit_hash: "99259002b41e116d28ccb2d04a9fbe22baed0c7f" 24 | dataset_subset: "hard" 25 | lora: true 26 | r: 27 | type: categorical 28 | vals: [2, 8, 32, 128] 29 | lora_alpha: 30 | type: categorical 31 | vals: [0.5, 1, 2, 8, 32, 128, 256, 512] 32 | lora_dropout: 33 | type: categorical 34 | vals: [0.1] 35 | hf_token: 36 | training_args: 37 | output_dir: "/tmp/llm_finetuning" 38 | max_steps: 3000 39 | per_device_train_batch_size: 4 40 | per_device_eval_batch_size: 4 41 | bf16: true 42 | evaluation_strategy: "steps" 43 | eval_steps: 500 44 | logging_strategy: "steps" 45 | logging_steps: 100 46 | save_strategy: "steps" 47 | save_steps: 1000 48 | learning_rate: 1e-5 49 | deepspeed: true 50 | gradient_checkpointing: true 51 | use_rslora: false 52 | entrypoint: >- 53 | python -m determined.launch.torch_distributed 54 | python finetune.py 55 | max_restarts: 0 -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/squad_v2_beam_search.yaml: 
-------------------------------------------------------------------------------- 1 | name: huggingface_squad_v2_with_beam_search 2 | hyperparameters: 3 | pretrained_model_name_or_path: xlnet-large-cased 4 | model_mode: question-answering 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 4 10 | learning_rate: 3.0e-5 11 | adam_epsilon: 1.0e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 0 15 | data: 16 | dataset_name: squad_v2 17 | train_file: null 18 | validation_file: null 19 | overwrite_cache: false 20 | preprocessing_num_workers: null 21 | max_seq_length: 384 22 | pad_to_max_length: true 23 | version_2_with_negative: true 24 | null_score_diff_threshold: 0 25 | doc_stride: 128 26 | n_best_size: 20 27 | max_answer_length: 30 28 | output_dir: /tmp 29 | # Number of records per epoch differs based on max_seq_length. 30 | records_per_epoch: 132240 31 | min_validation_period: 32 | batches: 5000 33 | searcher: 34 | name: single 35 | metric: f1 36 | max_length: 37 | epochs: 4 38 | smaller_is_better: false 39 | environment: 40 | image: 41 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 42 | resources: 43 | slots_per_trial: 1 44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 
46 | bind_mounts: 47 | - host_path: /tmp 48 | container_path: /root/.cache 49 | - host_path: /tmp 50 | container_path: /tmp 51 | entrypoint: qa_beam_search_trial:QABeamSearchTrial 52 | -------------------------------------------------------------------------------- /computer_vision/detr_coco_pytorch/const_fake.yaml: -------------------------------------------------------------------------------- 1 | name: detr_coco_fake_data 2 | hyperparameters: 3 | lr: 1.0e-4 4 | lr_backbone: 1.0e-5 5 | global_batch_size: 2 6 | weight_decay: 1.0e-4 7 | lr_drop: 100 8 | clip_max_norm: 0.1 9 | 10 | # Set to true if you want to warmstart with pretrained weights. 11 | warmstart: false 12 | 13 | # Backbone 14 | backbone: resnet50 15 | dilation: false 16 | position_embedding: sine 17 | 18 | # Transformer 19 | enc_layers: 6 20 | dec_layers: 6 21 | dim_feedforward: 2048 22 | hidden_dim: 256 23 | dropout: 0.1 24 | nheads: 8 25 | num_queries: 100 26 | pre_norm: false 27 | 28 | # Loss 29 | aux_loss: true 30 | 31 | # Matcher 32 | set_cost_class: 1 33 | set_cost_bbox: 5 34 | set_cost_giou: 2 35 | 36 | # Loss Coefficients 37 | mask_loss_coef: 1 38 | dice_loss_coef: 1 39 | bbox_loss_coef: 5 40 | giou_loss_coef: 2 41 | eos_coef: 0.1 42 | 43 | # Dataset 44 | dataset_file: coco 45 | backend: fake # specifiy the backend you want to use. 
one of: gcs, aws, fake, local 46 | data_dir: /data # bucket name if using gcs or aws, otherwise directory to dataset 47 | masks: false 48 | num_workers: 4 49 | 50 | device: cuda 51 | 52 | bind_mounts: 53 | - host_path: /tmp 54 | container_path: /data 55 | read_only: false 56 | 57 | records_per_epoch: 117264 58 | searcher: 59 | name: single 60 | metric: mAP 61 | smaller_is_better: false 62 | max_length: 63 | batches: 100 64 | resources: 65 | shm_size: 2000000000 66 | 67 | entrypoint: model_def:DETRTrial 68 | -------------------------------------------------------------------------------- /features/checkpoint_hooks_pytorch/data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import urllib.parse 5 | from typing import Any, Dict 6 | 7 | import requests 8 | from torchvision import datasets, transforms 9 | 10 | 11 | def get_dataset(data_dir: str, train: bool) -> Any: 12 | return datasets.MNIST( 13 | data_dir, 14 | train=train, 15 | transform=transforms.Compose( 16 | [ 17 | transforms.ToTensor(), 18 | # These are the precomputed mean and standard deviation of the 19 | # MNIST data; this normalizes the data to have zero mean and unit 20 | # standard deviation. 
21 | transforms.Normalize((0.1307,), (0.3081,)), 22 | ] 23 | ), 24 | ) 25 | 26 | 27 | def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str: 28 | url = data_config["url"] 29 | url_path = urllib.parse.urlparse(url).path 30 | basename = url_path.rsplit("/", 1)[1] 31 | 32 | download_directory = os.path.join(download_directory, "MNIST") 33 | os.makedirs(download_directory, exist_ok=True) 34 | filepath = os.path.join(download_directory, basename) 35 | if not os.path.exists(filepath): 36 | logging.info("Downloading {} to {}".format(url, filepath)) 37 | 38 | r = requests.get(url, stream=True) 39 | with open(filepath, "wb") as f: 40 | for chunk in r.iter_content(chunk_size=8192): 41 | if chunk: 42 | f.write(chunk) 43 | 44 | shutil.unpack_archive(filepath, download_directory) 45 | 46 | return os.path.dirname(download_directory) 47 | -------------------------------------------------------------------------------- /features/custom_reducers_mnist_pytorch/data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import urllib.parse 5 | from typing import Any, Dict 6 | 7 | import requests 8 | from torchvision import datasets, transforms 9 | 10 | 11 | def get_dataset(data_dir: str, train: bool) -> Any: 12 | return datasets.MNIST( 13 | data_dir, 14 | train=train, 15 | transform=transforms.Compose( 16 | [ 17 | transforms.ToTensor(), 18 | # These are the precomputed mean and standard deviation of the 19 | # MNIST data; this normalizes the data to have zero mean and unit 20 | # standard deviation. 
21 | transforms.Normalize((0.1307,), (0.3081,)), 22 | ] 23 | ), 24 | ) 25 | 26 | 27 | def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str: 28 | url = data_config["url"] 29 | url_path = urllib.parse.urlparse(url).path 30 | basename = url_path.rsplit("/", 1)[1] 31 | 32 | download_directory = os.path.join(download_directory, "MNIST") 33 | os.makedirs(download_directory, exist_ok=True) 34 | filepath = os.path.join(download_directory, basename) 35 | if not os.path.exists(filepath): 36 | logging.info("Downloading {} to {}".format(url, filepath)) 37 | 38 | r = requests.get(url, stream=True) 39 | with open(filepath, "wb") as f: 40 | for chunk in r.iter_content(chunk_size=8192): 41 | if chunk: 42 | f.write(chunk) 43 | 44 | shutil.unpack_archive(filepath, download_directory) 45 | 46 | return os.path.dirname(download_directory) 47 | -------------------------------------------------------------------------------- /features/hp_constraints_mnist_pytorch/data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import urllib.parse 5 | from typing import Any, Dict 6 | 7 | import requests 8 | from torchvision import datasets, transforms 9 | 10 | 11 | def get_dataset(data_dir: str, train: bool) -> Any: 12 | return datasets.MNIST( 13 | data_dir, 14 | train=train, 15 | transform=transforms.Compose( 16 | [ 17 | transforms.ToTensor(), 18 | # These are the precomputed mean and standard deviation of the 19 | # MNIST data; this normalizes the data to have zero mean and unit 20 | # standard deviation. 
21 | transforms.Normalize((0.1307,), (0.3081,)), 22 | ] 23 | ), 24 | ) 25 | 26 | 27 | def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str: 28 | url = data_config["url"] 29 | url_path = urllib.parse.urlparse(url).path 30 | basename = url_path.rsplit("/", 1)[1] 31 | 32 | download_directory = os.path.join(download_directory, "MNIST") 33 | os.makedirs(download_directory, exist_ok=True) 34 | filepath = os.path.join(download_directory, basename) 35 | if not os.path.exists(filepath): 36 | logging.info("Downloading {} to {}".format(url, filepath)) 37 | 38 | r = requests.get(url, stream=True) 39 | with open(filepath, "wb") as f: 40 | for chunk in r.iter_content(chunk_size=8192): 41 | if chunk: 42 | f.write(chunk) 43 | 44 | shutil.unpack_archive(filepath, download_directory) 45 | 46 | return os.path.dirname(download_directory) 47 | -------------------------------------------------------------------------------- /model_hub/huggingface/question-answering/squad_v2_albert.yaml: -------------------------------------------------------------------------------- 1 | name: huggingface_squad_v2_albert 2 | hyperparameters: 3 | pretrained_model_name_or_path: albert-xxlarge-v2 4 | model_mode: question-answering 5 | use_pretrained_weights: true 6 | use_apex_amp: false 7 | cache_dir: null 8 | # Training Args 9 | global_batch_size: 16 10 | learning_rate: 5e-5 11 | adam_epsilon: 1e-8 12 | weight_decay: 0 13 | lr_scheduler_type: linear 14 | num_warmup_steps: 1620 15 | data: 16 | dataset_name: squad_v2 17 | train_file: null 18 | validation_file: null 19 | overwrite_cache: false 20 | preprocessing_num_workers: null 21 | max_seq_length: 384 22 | pad_to_max_length: true 23 | version_2_with_negative: true 24 | null_score_diff_threshold: 0 25 | doc_stride: 128 26 | n_best_size: 20 27 | max_answer_length: 30 28 | output_dir: /tmp 29 | optimizations: 30 | aggregation_frequency: 3 31 | # Number of records per epoch differs based on max_seq_length. 
32 | records_per_epoch: 131754 33 | min_validation_period: 34 | batches: 5000 35 | searcher: 36 | name: single 37 | metric: f1 38 | max_length: 39 | batches: 16500 40 | smaller_is_better: false 41 | environment: 42 | image: 43 | gpu: determinedai/model-hub-transformers:0.26.2-dev0 44 | resources: 45 | slots_per_trial: 8 46 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the 47 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance. 48 | bind_mounts: 49 | - host_path: /tmp 50 | container_path: /root/.cache 51 | - host_path: /tmp 52 | container_path: /tmp 53 | entrypoint: qa_trial:QATrial 54 | -------------------------------------------------------------------------------- /blog/python_sdk_demo/mednist_model/net.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | # from https://github.com/MedMNIST/MedMNIST/blob/main/examples/getting_started.ipynb 5 | class Net(nn.Module): 6 | def __init__(self, in_channels, num_classes): 7 | super().__init__() 8 | 9 | self.layer1 = nn.Sequential( 10 | nn.Conv2d(in_channels, 16, kernel_size=3), nn.BatchNorm2d(16), nn.ReLU() 11 | ) 12 | 13 | self.layer2 = nn.Sequential( 14 | nn.Conv2d(16, 16, kernel_size=3), 15 | nn.BatchNorm2d(16), 16 | nn.ReLU(), 17 | nn.MaxPool2d(kernel_size=2, stride=2), 18 | ) 19 | 20 | self.layer3 = nn.Sequential( 21 | nn.Conv2d(16, 64, kernel_size=3), nn.BatchNorm2d(64), nn.ReLU() 22 | ) 23 | 24 | self.layer4 = nn.Sequential( 25 | nn.Conv2d(64, 64, kernel_size=3), nn.BatchNorm2d(64), nn.ReLU() 26 | ) 27 | 28 | self.layer5 = nn.Sequential( 29 | nn.Conv2d(64, 64, kernel_size=3, padding=1), 30 | nn.BatchNorm2d(64), 31 | nn.ReLU(), 32 | nn.MaxPool2d(kernel_size=2, stride=2), 33 | ) 34 | 35 | self.fc = nn.Sequential( 36 | nn.Linear(64 * 4 * 4, 128), 37 | nn.ReLU(), 38 | nn.Linear(128, 128), 39 | nn.ReLU(), 40 | nn.Linear(128, num_classes), 41 | ) 42 | 43 | def 
forward(self, x): 44 | x = self.layer1(x) 45 | x = self.layer2(x) 46 | x = self.layer3(x) 47 | x = self.layer4(x) 48 | x = self.layer5(x) 49 | x = x.view(x.size(0), -1) 50 | x = self.fc(x) 51 | return x 52 | -------------------------------------------------------------------------------- /custom_search_method/asha_search_method/experiment_files/data.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import shutil 4 | import urllib.parse 5 | from typing import Any, Dict 6 | 7 | import requests 8 | from torchvision import datasets, transforms 9 | 10 | 11 | def get_dataset(data_dir: str, train: bool) -> Any: 12 | return datasets.MNIST( 13 | data_dir, 14 | train=train, 15 | transform=transforms.Compose( 16 | [ 17 | transforms.ToTensor(), 18 | # These are the precomputed mean and standard deviation of the 19 | # MNIST data; this normalizes the data to have zero mean and unit 20 | # standard deviation. 21 | transforms.Normalize((0.1307,), (0.3081,)), 22 | ] 23 | ), 24 | ) 25 | 26 | 27 | def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str: 28 | url = data_config["url"] 29 | url_path = urllib.parse.urlparse(url).path 30 | basename = url_path.rsplit("/", 1)[1] 31 | 32 | download_directory = os.path.join(download_directory, "MNIST") 33 | os.makedirs(download_directory, exist_ok=True) 34 | filepath = os.path.join(download_directory, basename) 35 | if not os.path.exists(filepath): 36 | logging.info("Downloading {} to {}".format(url, filepath)) 37 | 38 | r = requests.get(url, stream=True) 39 | with open(filepath, "wb") as f: 40 | for chunk in r.iter_content(chunk_size=8192): 41 | if chunk: 42 | f.write(chunk) 43 | 44 | shutil.unpack_archive(filepath, download_directory) 45 | 46 | return os.path.dirname(download_directory) 47 | -------------------------------------------------------------------------------- /model_hub/mmdetection/hydra/README.md: 
-------------------------------------------------------------------------------- 1 | # Using model-hub mmdetection with [Hydra](https://hydra.cc/) 2 | Hydra is a framework for configuring applications that works very well with machine learning experiments. 3 | You can use Determined's Python SDK with Hydra to: 4 | * Easily submit experiments with different configurations 5 | * Perform parameter sweeps 6 | * Compose configurations 7 | 8 | ## Setup 9 | You need to install Determined and Hydra in order to try this out. 10 | ``` 11 | pip install hydra-core>=1.1 12 | pip install determined 13 | ``` 14 | 15 | ## Submitting experiments 16 | Make sure the `DET_MASTER` environment variable is set. Then you can create experiments by running 17 | ``` 18 | python mmdet_experiment.py hyperparameters.config_file=mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py 19 | ``` 20 | 21 | Hydra makes it easy to modify the configuration from the CLI: 22 | ``` 23 | python mmdet_experiment.py hyperparameters.config_file=faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py 24 | ``` 25 | 26 | Or try multiple values: 27 | ``` 28 | python mmdet_experiment.py --multirun \ 29 | hyperparameters.config_file=faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py,detr/detr_r50_8x2_150e_coco.py 30 | ``` 31 | 32 | Configuration with Hydra is also highly flexible and extensible. 33 | For example, you can run hyperparameter search on the optimizer learning rate by 34 | ``` 35 | python mmdet_experiment.py searcher=adaptive +hyperparameters=tune_optimizer hyperparameters.config_file=mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py 36 | ``` 37 | You can look the [config directory](configs) to see how we use some of this functionality. Feel free to add your own configs as needed to further customize the behavior. 
38 | -------------------------------------------------------------------------------- /computer_vision/iris_tf_keras/README.md: -------------------------------------------------------------------------------- 1 | # TensorFlow (tf.keras) Iris Species Categorization Example 2 | 3 | This example shows how to run a CNN on the Iris species dataset using 4 | Determined's tf.keras API. This example is adapted from this [Iris species 5 | categorization medium post](https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data-and-differences-between-eager-mode-on-and-off-9b4693e0b22). 6 | 7 | ## Files 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model. 9 | * **startup-hook.sh**: Additional dependencies that Determined will automatically install into each container for this experiment. 10 | 11 | ### Configuration Files 12 | * **const.yaml**: Train the model with constant hyperparameter values. 13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training). 14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 15 | 16 | ## Data: 17 | The current implementation uses [UCI's Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris). 18 | 19 | ## To Run: 20 | If you have not yet installed Determined, installation instructions can be found 21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 22 | 23 | Run the following command: `det -m experiment create -f 24 | const.yaml .`. The other configurations can be run by specifying the appropriate 25 | configuration file in place of `const.yaml`. 26 | 27 | ## Results: 28 | Training the model with the hyperparameter settings in `const.yaml` should yield 29 | a validation accuracy of ~95%. 
30 | -------------------------------------------------------------------------------- /blog/llm-finetuning-3/dpo.yaml: -------------------------------------------------------------------------------- 1 | name: gemma-2b dpo 2 | debug: false 3 | environment: 4 | environment_variables: 5 | - NCCL_DEBUG=INFO 6 | image: determinedai/genai-train:latest 7 | resources: 8 | slots_per_trial: 2 9 | resource_pool: A100 10 | max_slots: 8 11 | searcher: 12 | name: grid 13 | max_length: 14 | batches: 5000 15 | metric: eval_accuracy 16 | smaller_is_better: false 17 | hyperparameters: 18 | model_name: "google/gemma-2b-it" 19 | # model_ckpt: "6b6fbaa7-faa9-4449-867b-2939a147a335" 20 | datasets: 21 | - "argilla/dpo-mix-7k" 22 | - "jondurbin/truthy-dpo-v0.1" 23 | dpo_beta: 24 | type: categorical 25 | vals: 26 | - 0.1 27 | - 0.05 28 | - 0.01 29 | dpo_loss: "sigmoid" 30 | max_length: 4096 31 | max_prompt_length: 2048 32 | max_target_length: 2048 33 | precompute_ref_log_probs: true 34 | training_args: 35 | output_dir: "/tmp/llm_finetuning" 36 | num_train_epochs: 2 37 | per_device_train_batch_size: 1 38 | per_device_eval_batch_size: 1 39 | bf16: true 40 | bf16_full_eval: true 41 | evaluation_strategy: "steps" 42 | eval_steps: 100 43 | logging_strategy: "steps" 44 | logging_steps: 10 45 | save_strategy: "epoch" 46 | save_steps: 1 47 | learning_rate: 48 | type: categorical 49 | vals: 50 | - 1e-7 51 | - 5e-7 52 | - 5e-8 53 | gradient_accumulation_steps: 8 54 | gradient_checkpointing: true 55 | deepspeed: "ds_configs/ds_config_stage_2.json" 56 | warmup_ratio: 0.1 57 | lr_scheduler_type: "cosine" 58 | optim: "adamw_torch" 59 | entrypoint: >- 60 | python -m determined.launch.deepspeed 61 | python dpo_finetune.py 62 | max_restarts: 0 -------------------------------------------------------------------------------- /computer_vision/detr_coco_pytorch/data_utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | from io import 
BytesIO 4 | from shutil import unpack_archive 5 | from tempfile import NamedTemporaryFile 6 | 7 | import requests 8 | 9 | 10 | def download_file(url, output_dir): 11 | local_filename = os.path.join(output_dir, url.split("/")[-1]) 12 | with requests.get(url, stream=True) as r: 13 | r.raise_for_status() 14 | with open(local_filename, "wb") as f: 15 | for chunk in r.iter_content(chunk_size=8192): 16 | # If you have chunk encoded response uncomment the line below and set chunk_size parameter to None. 17 | # if chunk: 18 | f.write(chunk) 19 | return local_filename 20 | 21 | 22 | async def download_and_extract_url(zipurl, outdir): 23 | filename = download_file(zipurl, outdir) 24 | with open(filename, "rb") as f, NamedTemporaryFile() as tfile: 25 | tfile.write(f.read()) 26 | tfile.seek(0) 27 | unpack_archive(tfile.name, outdir, format="zip") 28 | print("finished extracting: {}".format(zipurl)) 29 | await asyncio.sleep(1) 30 | 31 | 32 | def async_download_url_list(url_list, outdir): 33 | loop = asyncio.get_event_loop() 34 | tasks = [asyncio.ensure_future(download_and_extract_url(url, outdir)) for url in url_list] 35 | loop.run_until_complete(asyncio.gather(*tasks)) 36 | 37 | 38 | def download_coco_from_source(data_dir): 39 | url_list = [ 40 | "http://images.cocodataset.org/zips/train2017.zip", 41 | "http://images.cocodataset.org/zips/val2017.zip", 42 | ] 43 | async_download_url_list(url_list, data_dir) 44 | 45 | 46 | if __name__ == "__main__": 47 | download_coco_from_source("/tmp") 48 | -------------------------------------------------------------------------------- /custom_search_method/asha_search_method/remote_search_runner/README.md: -------------------------------------------------------------------------------- 1 | # Custom SearchMethod with RemoteSearchRunner 2 | 3 | In this example, we use RemoteSearchRunner, which executes a custom SearchMethod as a single trial experiment and 4 | orchestrates a multi-trial experiment. 
Both the custom SearchMethod and the multi-trial experiment are executed 5 | on the Determined cluster. 6 | 7 | For an example of running the custom SearchMethod locally, 8 | see `examples/custom_search_method/asha_search_method/local_search_runner`. 9 | 10 | ## Files 11 | * **run_experiment.py**: The code for running a custom SearchMethod with RemoteSearchRunner. 12 | 13 | ### Configuration Files 14 | * **searcher.yaml**: Configuration for running the custom SearchMethod as an experiment on the Determined cluster. 15 | 16 | 17 | ## To Run 18 | If you have not yet installed Determined, installation instructions can be found 19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html 20 | 21 | 1. Set the `DET_MASTER` environment variable, which is the network address of the Determined master. 22 | For instance, `export DET_MASTER=<master_host:port>`. 23 | 2. Run the following command in the `asha_search_method` directory to start RemoteSearchRunner on the Determined cluster: 24 | `det experiment create remote_search_runner/searcher.yaml .`. 25 | 26 | ## Result 27 | RemoteSearchRunner is submitted to the Determined master as a single trial experiment. 28 | While running on the cluster, RemoteSearchRunner executes the custom SearchMethod and starts a multi-trial experiment 29 | for hyperparameter search. Similarly to LocalSearchRunner, RemoteSearchRunner handles the communication between the 30 | custom SearchMethod and the multi-trial experiment. 
def download_file(url, output_dir):
    """Stream ``url`` to disk and return the path of the downloaded file.

    The local file name is the last ``/``-separated component of ``url`` and
    the file is written inside ``output_dir``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    local_filename = os.path.join(output_dir, url.split("/")[-1])
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, "wb") as f:
            # Stream in fixed-size chunks so the archive never has to fit in memory.
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    return local_filename


def _download_and_extract(zipurl, outdir):
    """Blocking helper: download ``zipurl`` into ``outdir`` and unzip it there."""
    filename = download_file(zipurl, outdir)
    # Unpack straight from the downloaded file; copying it through memory into
    # a temporary file first would duplicate a multi-gigabyte archive.
    unpack_archive(filename, outdir, format="zip")
    print("finished extracting: {}".format(zipurl))


async def download_and_extract_url(zipurl, outdir):
    """Download the zip archive at ``zipurl`` and extract it into ``outdir``.

    The blocking network and disk work runs in the event loop's default
    executor so that several of these coroutines can make progress at once.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _download_and_extract, zipurl, outdir)


def async_download_url_list(url_list, outdir):
    """Download and extract every archive in ``url_list`` into ``outdir``.

    Blocks until all downloads have finished; the first failure is re-raised.
    An empty ``url_list`` is a no-op.
    """

    async def _download_all():
        await asyncio.gather(*(download_and_extract_url(url, outdir) for url in url_list))

    # asyncio.run creates and tears down its own loop, avoiding the deprecated
    # implicit-loop behaviour of asyncio.get_event_loop().
    asyncio.run(_download_all())


def download_coco_from_source(data_dir):
    """Fetch and unpack the COCO 2017 train and val image archives into ``data_dir``."""
    url_list = [
        "http://images.cocodataset.org/zips/train2017.zip",
        "http://images.cocodataset.org/zips/val2017.zip",
    ]
    async_download_url_list(url_list, data_dir)


if __name__ == "__main__":
    download_coco_from_source("/tmp")
2 | 3 | ## Overview 4 | 5 | This script shows example usage of the Determined Python SDK to run and administer experiments. 6 | 7 | It: 8 | 1. Archives any existing experiments with the same names as the datasets we'll train on. 9 | 2. Creates models for each dataset and registers them in the Determined model registry. 10 | 3. Trains a model for each dataset by creating an experiment. 11 | 4. Registers the best checkpoint for each experiment in the Determined model registry. 12 | 13 | For an in-depth discussion of this script, see the blog post: 14 | https://www.determined.ai/blog/python-sdk 15 | 16 | For more information on the Determined Python SDK, see: 17 | https://docs.determined.ai/latest/reference/python-sdk.html 18 | 19 | ## Installation / Execution 20 | 21 | To run this demo: 22 | 23 | 1. Install dependencies. In addition to the Determined CLI, this demo uses MedMNIST datasets. 24 | ``` 25 | pip install -r requirements.txt 26 | ``` 27 | 28 | 2. Set the DET_MASTER environment variable. For example, if you're running this locally: 29 | ``` 30 | export DET_MASTER=localhost:8080 31 | ``` 32 | 33 | For more information about configuring the CLI, see [this doc](https://docs.determined.ai/latest/setup-cluster/setup-clients.html#setting-up-clients). 34 | 35 | 3. Now the demo is ready to be executed. To run experiments: 36 | ``` 37 | python determined_sdk_demo.py 38 | ``` 39 | 40 | ## Contributors 41 | 42 | - [Wesley Turner](https://github.com/wes-turner) 43 | - [Kevin Musgrave](https://github.com/KevinMusgrave) 44 | 45 | The code in the `mednist_model` directory is based on the [`determined_medmnist_e2e`](https://github.com/ighodgao/determined_medmnist_e2e) repo by [Isha Ghodgaonkar](https://github.com/ighodgao). 
def generate_and_plot_images(generator: tf.keras.Sequential, noise_dim: int) -> None:
    """Sample 16 noise vectors, run them through ``generator`` and show a 4x4 grid."""
    noise = tf.random.normal([16, noise_dim])
    # training=False keeps every layer (e.g. batchnorm) in inference mode.
    images = generator(noise, training=False)

    plt.figure(figsize=(4, 4))

    image_count = images.shape[0]
    for position in range(1, image_count + 1):
        plt.subplot(4, 4, position)
        # Rescale the first channel by 127.5 and shift by 127.5 before display.
        pixels = images[position - 1, :, :, 0] * 127.5 + 127.5
        plt.imshow(pixels, cmap="gray")
        plt.axis("off")
    plt.show()


def export_model(experiment_id: int) -> tf.keras.Model:
    """Load and return the model stored in the experiment's top checkpoint."""
    experiment = client.get_experiment(experiment_id)
    best_checkpoint = experiment.top_checkpoint()
    return best_checkpoint.load()


def main():
    """Parse CLI arguments, log in to the master, load the model and plot samples."""
    cli = argparse.ArgumentParser(description="DCGan Model Export")
    cli.add_argument("--experiment-id", type=int, required=True, help="Experiment ID to export.")
    cli.add_argument("--master-url", type=str, default="", help="URL of the Determined master.")
    cli.add_argument(
        "--noise-dim",
        type=int,
        default=128,
        help="Needs to match noise dim during training.",
    )
    options = cli.parse_args()

    client.login(options.master_url)
    trained = export_model(options.experiment_id)
    generate_and_plot_images(trained.generator, options.noise_dim)


if __name__ == "__main__":
    main()
class AlexNet(nn.Module):
    """AlexNet image classifier, following ``torchvision.models.alexnet``.

    The network is split into three attributes that are applied in order:
    ``features`` (convolutional stack), ``avgpool`` (adaptive pooling to 6x6)
    and ``classifier`` (fully connected head).

    Args:
        num_classes: size of the final linear layer's output.
        dropout: drop probability used by both ``nn.Dropout`` layers in the
            classifier. The default of 0.5 matches ``nn.Dropout()``.
    """

    def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None:
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Adaptive pooling fixes the spatial size at 6x6, which is what the
        # first linear layer (256 * 6 * 6 inputs) expects.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``features``, ``avgpool`` and ``classifier`` on ``x`` in turn."""
        x = self.features(x)
        x = self.avgpool(x)
        # Flatten everything after dim 0 for the linear layers.
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x