├── blog
    ├── llm-finetuning
    │   ├── .detignore
    │   ├── .gitignore
    │   ├── startup-hook.sh
    │   ├── requirements.txt
    │   ├── distributed.yaml
    │   ├── chat_format.py
    │   └── README.md
    ├── llm-finetuning-2
    │   ├── .detignore
    │   ├── .gitignore
    │   ├── startup-hook.sh
    │   ├── requirements.txt
    │   ├── deepspeed.yaml
    │   ├── lora.yaml
    │   └── ds_configs
    │   │   ├── ds_config_stage_1.json
    │   │   ├── ds_config_stage_2.json
    │   │   ├── ds_config_stage_3.json
    │   │   └── ds_config_stage_2_cpu_offload.json
    ├── llm-finetuning-3
    │   ├── .detignore
    │   ├── .gitignore
    │   ├── startup-hook.sh
    │   ├── requirements.txt
    │   ├── chat_format.py
    │   ├── ds_configs
    │   │   ├── ds_config_stage_2.json
    │   │   ├── ds_config_stage_3.json
    │   │   ├── ds_config_stage_1.json
    │   │   └── ds_config_stage_2_cpu_offload.json
    │   └── dpo.yaml
    ├── lora-parameters
    │   ├── .detignore
    │   ├── .gitignore
    │   ├── startup-hook.sh
    │   ├── requirements.txt
    │   ├── README.md
    │   ├── ds_configs
    │   │   └── ds_config_stage_3.json
    │   └── lora.yaml
    ├── python_sdk_demo
    │   ├── mednist_model
    │   │   ├── requirements.txt
    │   │   ├── startup-hook.sh
    │   │   ├── config.yaml
    │   │   └── net.py
    │   ├── requirements.txt
    │   └── README.md
    ├── tp
    │   ├── matmul.png
    │   ├── mlp_tp.png
    │   ├── README.md
    │   ├── matmul_profiling.yaml
    │   ├── tp_profiling.yaml
    │   └── test_dot_product_local.py
    ├── README.md
    └── act-mem-2
    │   ├── requirements.txt
    │   ├── README.md
    │   ├── attn_script.py
    │   ├── mlp_script.py
    │   └── block_script.py
├── computer_vision
    ├── detectron2_coco_pytorch
    │   ├── .detignore
    │   ├── metrics_by_time.png
    │   ├── Makefile
    │   ├── const_fake.yaml
    │   ├── mask_rcnn_R_50_FPN_noaug_1x.yaml
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── Dockerfile
    │   └── Base-RCNN-FPN.yaml
    ├── iris_tf_keras
    │   ├── startup-hook.sh
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── adaptive.yaml
    │   └── README.md
    ├── detr_coco_pytorch
    │   ├── imgs
    │   │   ├── val_curves.png
    │   │   ├── train_curves.png
    │   │   └── detr_architecture.png
    │   ├── startup-hook.sh
    │   ├── const_fake.yaml
    │   └── data_utils.py
    ├── efficientdet_pytorch
    │   ├── loss_by_gpus.png
    │   ├── Samples_per_sec.png
    │   ├── startup-hook.sh
    │   └── efficientdet_files
    │   │   └── utils.py
    ├── unets_tf_keras
    │   ├── Cumulative_Batches.png
    │   ├── Validation_Accuracy.png
    │   ├── startup-hook.sh
    │   ├── const.yaml
    │   └── distributed.yaml
    ├── cifar10_pytorch_inference
    │   ├── startup-hook.sh
    │   └── const.yaml
    ├── deformabledetr_coco_pytorch
    │   ├── imgs
    │   │   ├── val_curves.png
    │   │   └── train_curves.png
    │   ├── startup-hook.sh
    │   └── data_utils.py
    ├── byol_pytorch
    │   ├── startup-hook.sh
    │   ├── backbone.py
    │   ├── utils.py
    │   ├── evaluate_result.py
    │   └── generate_blob_list.py
    ├── fasterrcnn_coco_pytorch
    │   ├── const.yaml
    │   ├── adaptive.yaml
    │   └── README.md
    ├── cifar10_pytorch
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── adaptive.yaml
    │   ├── distributed_inference.yaml
    │   └── README.md
    └── cifar10_tf_keras
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── adaptive.yaml
    │   └── README.md
├── gan
    ├── pix2pix_tf_keras
    │   ├── .gitignore
    │   ├── .detignore
    │   ├── images
    │   │   ├── batches_vs_time.jpg
    │   │   ├── generated_example.jpeg
    │   │   ├── training_loss_vs_time.jpg
    │   │   └── validation_loss_vs_time.jpg
    │   ├── print_models.py
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── adaptive.yaml
    │   └── pix2pix
    │   │   └── sampling.py
    ├── dcgan_tf_keras
    │   ├── images
    │   │   └── dcgan_inference_example.png
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── data.py
    │   ├── README.md
    │   └── export.py
    ├── gan_mnist_pytorch
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── README.md
    │   └── data.py
    └── cyclegan
    │   ├── 1-gpu.yaml
    │   ├── 8-gpus.yaml
    │   ├── 64-gpus.yaml
    │   ├── startup-hook.sh
    │   ├── datasets.py
    │   └── utils.py
├── model_hub
    ├── mmdetection
    │   ├── hydra
    │   │   ├── configs
    │   │   │   ├── profiling
    │   │   │   │   ├── disabled.yaml
    │   │   │   │   └── enabled.yaml
    │   │   │   ├── data
    │   │   │   │   ├── disk.yaml
    │   │   │   │   ├── fake.yaml
    │   │   │   │   ├── gcs.yaml
    │   │   │   │   └── s3.yaml
    │   │   │   ├── hyperparameters
    │   │   │   │   ├── fp16.yaml
    │   │   │   │   ├── ann_file.yaml
    │   │   │   │   ├── base.yaml
    │   │   │   │   ├── tune_optimizer.yaml
    │   │   │   │   └── grad_clip.yaml
    │   │   │   ├── searcher
    │   │   │   │   ├── single.yaml
    │   │   │   │   └── adaptive.yaml
    │   │   │   └── config.yaml
    │   │   ├── mmdet_experiment.py
    │   │   └── README.md
    │   └── fasterrcnn.png
    └── huggingface
    │   ├── multiple-choice
    │       ├── figures
    │       │   └── swag.png
    │       └── swag_config.yaml
    │   ├── language-modeling
    │       ├── figures
    │       │   ├── clm.png
    │       │   ├── mlm.png
    │       │   └── plm.png
    │       ├── clm_config.yaml
    │       ├── mlm_config.yaml
    │       └── plm_config.yaml
    │   ├── question-answering
    │       ├── figures
    │       │   ├── squad.png
    │       │   ├── squad_v2.png
    │       │   ├── squad_v2_albert.png
    │       │   ├── squad_beam_search.png
    │       │   ├── squad_distributed.png
    │       │   └── squad_v2_beam_search.png
    │       ├── squad.yaml
    │       ├── squad_v2.yaml
    │       ├── squad_beam_search.yaml
    │       ├── squad_distributed.yaml
    │       ├── squad_v2_beam_search.yaml
    │       └── squad_v2_albert.yaml
    │   ├── text-classification
    │       ├── figures
    │       │   ├── glue.png
    │       │   └── xnli.png
    │       └── xnli_config.yaml
    │   └── token-classification
    │       ├── figures
    │           └── ner.png
    │       └── ner_config.yaml
├── features
    ├── ports_flask
    │   ├── startup-hook.sh
    │   ├── hello-client.yaml
    │   ├── hello-server.yaml
    │   ├── hello-server.py
    │   └── README.md
    ├── torch_batch_process_embeddings
    │   ├── startup-hook.sh
    │   ├── requirements.txt
    │   └── distributed.yaml
    ├── torch_batch_process_core_api_comparison
    │   ├── constants.py
    │   ├── core_api_config.yaml
    │   ├── torch_batch_process_config.yaml
    │   ├── model.py
    │   └── README.md
    ├── checkpoint_hooks_pytorch
    │   ├── const.yaml
    │   ├── layers.py
    │   ├── README.md
    │   └── data.py
    ├── custom_reducers_mnist_pytorch
    │   ├── const.yaml
    │   ├── layers.py
    │   ├── distributed.yaml
    │   ├── README.md
    │   └── data.py
    └── hp_constraints_mnist_pytorch
    │   ├── layers.py
    │   ├── adaptive.yaml
    │   ├── README.md
    │   └── data.py
├── hp_search_benchmarks
    ├── darts_cifar10_pytorch
    │   ├── startup-hook.sh
    │   ├── figures
    │   │   └── constrained_adaptive.png
    │   └── genotypes.py
    └── darts_penntreebank_pytorch
    │   ├── startup-hook.sh
    │   ├── const.yaml
    │   └── randomNAS_files
    │       └── genotypes.py
├── meta_learning
    └── protonet_omniglot_pytorch
    │   ├── startup-hook.sh
    │   ├── omniglot_20w1s.png
    │   ├── fetch_data.sh
    │   ├── 20way1shot.yaml
    │   └── 20way5shot.yaml
├── nas
    └── gaea_pytorch
    │   ├── eval
    │       └── top5_val.png
    │   └── search
    │       ├── optimizer.py
    │       ├── const.yaml
    │       └── data.py
├── nlp
    ├── bert_glue_pytorch
    │   ├── startup-hook.sh
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   └── constants.py
    ├── word_language_model
    │   ├── validation_loss_table.png
    │   ├── const.yaml
    │   └── distributed.yaml
    └── albert_squad_pytorch
    │   ├── startup-hook.sh
    │   ├── constants.py
    │   ├── distributed_8gpu.yaml
    │   ├── distributed_64gpu.yaml
    │   └── const.yaml
├── graphs
    └── proteins_pytorch_geometric
    │   ├── startup-hook.sh
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   └── adaptive.yaml
├── custom_search_method
    └── asha_search_method
    │   ├── remote_search_runner
    │       ├── searcher.yaml
    │       └── README.md
    │   ├── experiment_files
    │       ├── config.yaml
    │       ├── layers.py
    │       └── data.py
    │   └── local_search_runner
    │       └── README.md
├── tutorials
    └── fashion_mnist_tf_keras
    │   ├── const.yaml
    │   ├── distributed.yaml
    │   ├── adaptive.yaml
    │   └── README.md
├── deepspeed
    ├── deepspeed_dcgan
    │   ├── ds_config.json
    │   ├── mnist.yaml
    │   ├── mnist_grad_accum.yaml
    │   └── cifar10_zero2.yaml
    ├── pipeline_parallelism
    │   ├── ds_config.json
    │   ├── README.md
    │   ├── distributed.yaml
    │   └── alexnet.py
    ├── cifar10_cpu_offloading
    │   ├── ds_config_no_offload.json
    │   ├── zero_3_cpu_offload.yaml
    │   ├── zero_no_offload.yaml
    │   └── ds_config_offload.json
    └── cifar10_moe
    │   ├── ds_config.json
    │   ├── moe.yaml
    │   ├── zero_stages.yaml
    │   └── README.md
├── .github
    └── workflows
    │   └── check_markdown_links.yaml
├── fsdp
    └── minimal_fsdp
    │   └── config.yaml
└── .gitignore


/blog/llm-finetuning/.detignore:
--------------------------------------------------------------------------------
1 | text-to-sql*
2 | checkpoints


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/.detignore:
--------------------------------------------------------------------------------
1 | text-to-sql*
2 | checkpoints


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/.detignore:
--------------------------------------------------------------------------------
1 | text-to-sql*
2 | checkpoints


--------------------------------------------------------------------------------
/blog/lora-parameters/.detignore:
--------------------------------------------------------------------------------
1 | text-to-sql*
2 | checkpoints


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/.detignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | 


--------------------------------------------------------------------------------
/computer_vision/iris_tf_keras/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install pandas
2 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/.gitignore:
--------------------------------------------------------------------------------
1 | logs/
2 | *.png
3 | checkpoints/
4 | 


--------------------------------------------------------------------------------
/blog/python_sdk_demo/mednist_model/requirements.txt:
--------------------------------------------------------------------------------
1 | medmnist
2 | wget
3 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/profiling/disabled.yaml:
--------------------------------------------------------------------------------
1 | enabled: false
2 | 


--------------------------------------------------------------------------------
/features/ports_flask/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | pip install flask
4 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/.detignore:
--------------------------------------------------------------------------------
1 | checkpoints/
2 | images/
3 | *.png
4 | *.jpeg
5 | 


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_cifar10_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install attrdict
2 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | text-to-sql*
4 | checkpoints


--------------------------------------------------------------------------------
/blog/python_sdk_demo/requirements.txt:
--------------------------------------------------------------------------------
1 | determined>=0.26.4
2 | medmnist
3 | PyYAML
4 | 


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_penntreebank_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install wget
2 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/data/disk.yaml:
--------------------------------------------------------------------------------
1 | file_client_args:
2 |   backend: disk
3 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/data/fake.yaml:
--------------------------------------------------------------------------------
1 | file_client_args:
2 |   backend: fake
3 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | text-to-sql*
4 | checkpoints
5 | *.png


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | text-to-sql*
4 | checkpoints
5 | *.png


--------------------------------------------------------------------------------
/blog/lora-parameters/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | .DS_Store
3 | text-to-sql*
4 | checkpoints
5 | *.png


--------------------------------------------------------------------------------
/blog/tp/matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/blog/tp/matmul.png


--------------------------------------------------------------------------------
/blog/tp/mlp_tp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/blog/tp/mlp_tp.png


--------------------------------------------------------------------------------
/blog/README.md:
--------------------------------------------------------------------------------
1 | This directory hosts example code used in the [Determined AI blog](https://www.determined.ai/blog).


--------------------------------------------------------------------------------
/blog/lora-parameters/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip install --upgrade pip
3 | pip install -r requirements.txt


--------------------------------------------------------------------------------
/features/torch_batch_process_embeddings/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | pip install -r requirements.txt
4 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/data/gcs.yaml:
--------------------------------------------------------------------------------
1 | file_client_args:
2 |   backend: gcs
3 |   bucket_name: ???
4 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/data/s3.yaml:
--------------------------------------------------------------------------------
1 | file_client_args:
2 |   backend: s3
3 |   bucket_name: ???
4 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip install --upgrade pip
3 | pip install -r requirements.txt


--------------------------------------------------------------------------------
/blog/llm-finetuning/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip install --upgrade pip
3 | pip install -r requirements.txt
4 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/hyperparameters/fp16.yaml:
--------------------------------------------------------------------------------
1 | override_mmdet_config:
2 |   fp16.loss_scale: 512.
3 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pip install --upgrade pip
3 | pip install -r requirements.txt
4 | 


--------------------------------------------------------------------------------
/meta_learning/protonet_omniglot_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | apt-get install -y unzip
3 | 
4 | ./fetch_data.sh
5 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/profiling/enabled.yaml:
--------------------------------------------------------------------------------
1 | enabled: true
2 | begin_on_batch: ???
3 | end_after_batch: ???
4 | 
5 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.36.2
2 | datasets==2.16.1
3 | evaluate==0.4.1
4 | trl==0.7.9
5 | scikit-learn==1.4.0


--------------------------------------------------------------------------------
/nas/gaea_pytorch/eval/top5_val.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/nas/gaea_pytorch/eval/top5_val.png


--------------------------------------------------------------------------------
/nlp/bert_glue_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install transformers==2.8.0 scikit-learn==0.22.2.post1
2 | pip install sentencepiece==0.1.91
3 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/fasterrcnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/mmdetection/fasterrcnn.png


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/hyperparameters/ann_file.yaml:
--------------------------------------------------------------------------------
1 | override_mmdet_config:
2 |   data.train.ann_file: ???
3 |   data.val.ann_file: ???
4 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/hyperparameters/base.yaml:
--------------------------------------------------------------------------------
1 | global_batch_size: 16
2 | config_file: ???
3 | merge_config: null
4 | use_pretrained: false
5 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.39.1
2 | datasets==2.17.0
3 | evaluate==0.4.1
4 | trl==0.8.1
5 | scikit-learn==1.4.0
6 | deepspeed==0.10.2


--------------------------------------------------------------------------------
/features/torch_batch_process_embeddings/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate>=0.12.0
2 | transformers>=4.28.1,<4.29.0
3 | tokenizers>=0.13.3
4 | datasets
5 | chromadb
6 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/images/batches_vs_time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/batches_vs_time.jpg


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/searcher/single.yaml:
--------------------------------------------------------------------------------
1 | name: single
2 | metric: bbox_mAP
3 | max_length:
4 |   batches: 87850
5 | smaller_is_better: false
6 | 


--------------------------------------------------------------------------------
/nlp/word_language_model/validation_loss_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/nlp/word_language_model/validation_loss_table.png


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/images/generated_example.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/generated_example.jpeg


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.37.2
2 | datasets==2.17.0
3 | evaluate==0.4.1
4 | trl==0.7.10
5 | scikit-learn==1.4.0
6 | deepspeed==0.10.2
7 | peft==0.8.2


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/imgs/val_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/val_curves.png


--------------------------------------------------------------------------------
/computer_vision/efficientdet_pytorch/loss_by_gpus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/efficientdet_pytorch/loss_by_gpus.png


--------------------------------------------------------------------------------
/computer_vision/unets_tf_keras/Cumulative_Batches.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/unets_tf_keras/Cumulative_Batches.png


--------------------------------------------------------------------------------
/computer_vision/unets_tf_keras/Validation_Accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/unets_tf_keras/Validation_Accuracy.png


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/images/dcgan_inference_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/dcgan_tf_keras/images/dcgan_inference_example.png


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/images/training_loss_vs_time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/training_loss_vs_time.jpg


--------------------------------------------------------------------------------
/model_hub/huggingface/multiple-choice/figures/swag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/multiple-choice/figures/swag.png


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch_inference/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install gdown
2 | gdown "https://drive.google.com/uc?id=1JTchzEFqtjbAVWXyNa5BkYPi12_CoHlS" -O state_dicts/resnet18.pt
3 | 


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/imgs/train_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/train_curves.png


--------------------------------------------------------------------------------
/computer_vision/efficientdet_pytorch/Samples_per_sec.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/efficientdet_pytorch/Samples_per_sec.png


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/images/validation_loss_vs_time.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/gan/pix2pix_tf_keras/images/validation_loss_vs_time.jpg


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/figures/clm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/clm.png


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/figures/mlm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/mlm.png


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/figures/plm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/language-modeling/figures/plm.png


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/metrics_by_time.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detectron2_coco_pytorch/metrics_by_time.png


--------------------------------------------------------------------------------
/meta_learning/protonet_omniglot_pytorch/omniglot_20w1s.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/meta_learning/protonet_omniglot_pytorch/omniglot_20w1s.png


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad.png


--------------------------------------------------------------------------------
/model_hub/huggingface/text-classification/figures/glue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/text-classification/figures/glue.png


--------------------------------------------------------------------------------
/model_hub/huggingface/text-classification/figures/xnli.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/text-classification/figures/xnli.png


--------------------------------------------------------------------------------
/model_hub/huggingface/token-classification/figures/ner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/token-classification/figures/ner.png


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/imgs/detr_architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/detr_coco_pytorch/imgs/detr_architecture.png


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad_v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2.png


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/hyperparameters/tune_optimizer.yaml:
--------------------------------------------------------------------------------
1 | override_mmdet_config:
2 |   optimizer.lr:
3 |     type: log
4 |     base: 10
5 |     minval: -3
6 |     maxval: -1
7 | 


--------------------------------------------------------------------------------
/blog/lora-parameters/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.37.2
2 | datasets==2.17.0
3 | evaluate==0.4.1
4 | trl==0.7.10
5 | scikit-learn==1.4.0
6 | deepspeed==0.10.2
7 | peft==0.8.2
8 | huggingface_hub


--------------------------------------------------------------------------------
/computer_vision/deformabledetr_coco_pytorch/imgs/val_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/deformabledetr_coco_pytorch/imgs/val_curves.png


--------------------------------------------------------------------------------
/computer_vision/deformabledetr_coco_pytorch/imgs/train_curves.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/computer_vision/deformabledetr_coco_pytorch/imgs/train_curves.png


--------------------------------------------------------------------------------
/features/torch_batch_process_core_api_comparison/constants.py:
--------------------------------------------------------------------------------
1 | DATA_DIRECTORY = "/tmp/data/cifar10"
2 | LOCK_FILE = "/tmp/data/cifar10/cifar10.lock"
3 | PREDICTIONS_DIRECTORY = "/tmp/inference_out/"
4 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad_v2_albert.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2_albert.png


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad_beam_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_beam_search.png


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad_distributed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_distributed.png


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/figures/squad_v2_beam_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/model_hub/huggingface/question-answering/figures/squad_v2_beam_search.png


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_cifar10_pytorch/figures/constrained_adaptive.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/determined-ai/determined-examples/HEAD/hp_search_benchmarks/darts_cifar10_pytorch/figures/constrained_adaptive.png


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/Makefile:
--------------------------------------------------------------------------------
1 | TAG := determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10
2 | 
3 | .PHONY: build
4 | build:
5 | 	docker build -f Dockerfile -t $(TAG) . && \
6 | 	docker push $(TAG)
7 | 


--------------------------------------------------------------------------------
/graphs/proteins_pytorch_geometric/startup-hook.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | pip install torch_geometric==2.2.0
4 | pip install torch_sparse==0.6.16 torch_scatter==2.1.0 -f https://pytorch-geometric.com/whl/torch-1.12.0+cu113.html
5 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/hyperparameters/grad_clip.yaml:
--------------------------------------------------------------------------------
1 | override_mmdet_config:
2 |   optimizer_config._delete_: true
3 |   optimizer_config.grad_clip.max_norm: ???
4 |   optimizer_config.grad_clip.norm_type: ???
5 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/searcher/adaptive.yaml:
--------------------------------------------------------------------------------
1 | name: adaptive_asha
2 | metric: bbox_mAP
3 | max_length:
4 |   batches: 87850
5 | max_trials: 100
6 | mode: aggressive
7 | max_rungs: 4
8 | smaller_is_better: false
9 | 


--------------------------------------------------------------------------------
/computer_vision/unets_tf_keras/startup-hook.sh:
--------------------------------------------------------------------------------
1 | pip install "setuptools<66" # necessary for installing tensorflow/examples for some reason
2 | pip install git+https://github.com/tensorflow/examples.git
3 | pip install -q -U tfds-nightly
4 | 


--------------------------------------------------------------------------------
/features/ports_flask/hello-client.yaml:
--------------------------------------------------------------------------------
 1 | name: hello-client
 2 | entrypoint: python3 hello-client.py
 3 | resources:
 4 |   slots: 0
 5 | max_restarts: 0
 6 | 
 7 | searcher:
 8 |    name: single
 9 |    metric: x
10 |    max_length: 10000000
11 | 


--------------------------------------------------------------------------------
/blog/act-mem-2/requirements.txt:
--------------------------------------------------------------------------------
 1 | einops==0.8.0
 2 | filelock==3.14.0
 3 | fsspec==2024.5.0
 4 | iniconfig==2.0.0
 5 | Jinja2==3.1.4
 6 | MarkupSafe==2.1.5
 7 | mpmath==1.3.0
 8 | networkx==3.3
 9 | packaging==24.0
10 | pluggy==1.5.0
11 | pytest==8.2.1
12 | sympy==1.12.1
13 | torch==2.3.0
14 | typing_extensions==4.12.0
15 | 


--------------------------------------------------------------------------------
/blog/python_sdk_demo/mednist_model/startup-hook.sh:
--------------------------------------------------------------------------------
1 | # This file is executed when the container that this model will run in
2 | # starts up.
3 | #
4 | # For more information, see
5 | #   https://docs.determined.ai/latest/model-dev-guide/prepare-container/custom-env.html#startup-hook 
6 | 
7 | pip install -r requirements.txt
8 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/remote_search_runner/searcher.yaml:
--------------------------------------------------------------------------------
 1 | name: remote-search-runner
 2 | entrypoint: python3 remote_search_runner/run_experiment.py
 3 | searcher:
 4 |   metric: validation_error
 5 |   smaller_is_better: true
 6 |   name: single
 7 |   max_length:
 8 |     batches: 1000
 9 | max_restarts: 0
10 | 


--------------------------------------------------------------------------------
/computer_vision/byol_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | # Copy LARS implementation from upstream repo.
2 | git clone https://github.com/untitled-ai/self_supervised.git
3 | (cd self_supervised && git checkout 6d14ca0402ecc13feda9b3a9fdc056fd1ac24473)
4 | cp self_supervised/lars.py ./
5 | python3 -m pip install attrdict byol-pytorch filelock
6 | 


--------------------------------------------------------------------------------
/nlp/albert_squad_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
1 | # Very important to pin sentencepiece as the newer version causes segmentation faults (as of Oct 2020)
2 | pip install sentencepiece==0.1.91
3 | pip install transformers==3.1.0
4 | pip install -e git+https://github.com/LiyuanLucasLiu/RAdam.git@baf4f65445c00d686d4098841b3ca1f62a886326#egg=radam  # HTTPS: GitHub no longer serves git://
5 | 


--------------------------------------------------------------------------------
/tutorials/fashion_mnist_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: fashion_mnist_tf_keras_const
 2 | hyperparameters:
 3 |   global_batch_size: 32
 4 |   dense1: 128
 5 | records_per_epoch: 60000
 6 | searcher:
 7 |   name: single
 8 |   metric: val_accuracy
 9 |   smaller_is_better: false
10 |   max_length:
11 |     epochs: 5
12 | entrypoint: model_def:FashionMNISTTrial
13 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch_inference/const.yaml:
--------------------------------------------------------------------------------
 1 | description: cifar10_pytorch_inference_const
 2 | hyperparameters:
 3 |   global_batch_size: 8
 4 | records_per_epoch: 50000
 5 | searcher:
 6 |   name: single
 7 |   metric: validation_error
 8 |   max_length:
 9 |     epochs: 1
10 | entrypoint: model_def:CIFARTrial
11 | min_validation_period:
12 |   epochs: 1
13 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/experiment_files/config.yaml:
--------------------------------------------------------------------------------
 1 | name: mnist-custom-search-experiment
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz
 4 | searcher:
 5 |   name: custom
 6 |   metric: validation_loss
 7 |   smaller_is_better: true
 8 |   unit: batches
 9 | entrypoint: model_def:MNistTrial
10 | 


--------------------------------------------------------------------------------
/features/ports_flask/hello-server.yaml:
--------------------------------------------------------------------------------
 1 | name: hello-server
 2 | entrypoint: python3 hello-server.py
 3 | resources:
 4 |   slots: 0
 5 | max_restarts: 0
 6 | environment:
 7 |   proxy_ports:
 8 |     - proxy_port: 5000
 9 |       proxy_tcp: true
10 |       unauthenticated: true
11 | 
12 | searcher:
13 |    name: single
14 |    metric: x
15 |    max_length: 10000000
16 | 


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: dc_gan
 2 | hyperparameters:
 3 |   noise_dim: 128
 4 |   global_batch_size: 256
 5 |   discriminator_lr: 0.0001
 6 |   generator_lr: 0.0001
 7 | records_per_epoch: 50000
 8 | searcher:
 9 |   name: single
10 |   metric: "val_d_loss"
11 |   smaller_is_better: true
12 |   max_length:
13 |     epochs: 50
14 | entrypoint: model_def:DCGanTrial
15 | 


--------------------------------------------------------------------------------
/deepspeed/deepspeed_dcgan/ds_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "train_batch_size": 64,
 3 |     "optimizer": {
 4 |         "type": "Adam",
 5 |         "params": {
 6 |             "lr": 0.0002,
 7 |             "betas": [
 8 |                 0.5,
 9 |                 0.999
10 |             ],
11 |             "eps": 1e-8
12 |         }
13 |     },
14 |     "steps_per_print": 10
15 | }
16 | 


--------------------------------------------------------------------------------
/.github/workflows/check_markdown_links.yaml:
--------------------------------------------------------------------------------
 1 | name: Check Markdown links
 2 | on:
 3 |   pull_request:
 4 |     branches:
 5 |       - main
 6 | 
 7 | jobs:
 8 |   markdown-link-check:
 9 |     runs-on: ubuntu-latest
10 |     steps:
11 |     - uses: actions/checkout@master
12 |     - uses: gaurav-nelson/github-action-markdown-link-check@v1
13 |       with:
14 |         use-quiet-mode: 'yes'
15 | 


--------------------------------------------------------------------------------
/features/ports_flask/hello-server.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, jsonify, request
 2 | 
 3 | app = Flask(__name__)
 4 | 
 5 | 
 6 | @app.route("/hello", methods=["GET"])
 7 | def hello():
 8 |     if request.method == "GET":
 9 |         data = {"data": "Hello World"}
10 |         return jsonify(data)
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     app.run(host="0.0.0.0", port=5000, debug=True)
15 | 


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_cifar10_pytorch/genotypes.py:
--------------------------------------------------------------------------------
 1 | from collections import namedtuple
 2 | 
 3 | Genotype = namedtuple("Genotype", "normal normal_concat reduce reduce_concat")
 4 | 
 5 | PRIMITIVES = [
 6 |     "none",
 7 |     "max_pool_3x3",
 8 |     "avg_pool_3x3",
 9 |     "skip_connect",
10 |     "sep_conv_3x3",
11 |     "sep_conv_5x5",
12 |     "dil_conv_3x3",
13 |     "dil_conv_5x5",
14 | ]
15 | 


--------------------------------------------------------------------------------
/tutorials/fashion_mnist_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: fashion_mnist_tf_keras_distributed
 2 | hyperparameters:
 3 |   global_batch_size: 256
 4 |   dense1: 128
 5 | resources:
 6 |   slots_per_trial: 8
 7 | records_per_epoch: 60000
 8 | searcher:
 9 |   name: single
10 |   metric: val_accuracy
11 |   smaller_is_better: false
12 |   max_length:
13 |     epochs: 5
14 | entrypoint: model_def:FashionMNISTTrial
15 | 


--------------------------------------------------------------------------------
/computer_vision/efficientdet_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
 1 | pip install timm==0.3.1
 2 | # pycocotools 2.0.5, a dependency of efficientdet-pytorch,
 3 | # would not install without cython
 4 | pip install pycocotools==2.0.4
 5 | git clone https://github.com/rwightman/efficientdet-pytorch.git
 6 | cd efficientdet-pytorch
 7 | git checkout 611532db49fdd691f48f913bc433391a12014bd8
 8 | python setup.py install
 9 | cd ..
10 | 


--------------------------------------------------------------------------------
/deepspeed/pipeline_parallelism/ds_config.json:
--------------------------------------------------------------------------------
 1 |  {
 2 |   "train_batch_size" : 256,
 3 |   "train_micro_batch_size_per_gpu" : 8,
 4 | 
 5 |    "optimizer": {
 6 |     "type": "Adam",
 7 |     "params": {
 8 |       "lr": 0.001,
 9 |       "betas": [
10 |         0.9,
11 |         0.999
12 |       ],
13 |       "eps": 1e-8
14 |     }
15 |   },
16 |   
17 |   "steps_per_print" : 100,
18 |   "wall_clock_breakdown" : false
19 |  }
20 | 


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: dc_gan
 2 | hyperparameters:
 3 |   noise_dim: 128
 4 |   global_batch_size: 1024
 5 |   discriminator_lr: 0.00003
 6 |   generator_lr: 0.00003
 7 | records_per_epoch: 50000
 8 | searcher:
 9 |   name: single
10 |   metric: "val_d_loss"
11 |   smaller_is_better: true
12 |   max_length:
13 |     epochs: 50
14 | entrypoint: model_def:DCGanTrial
15 | resources:
16 |   slots_per_trial: 4
17 | 


--------------------------------------------------------------------------------
/features/torch_batch_process_embeddings/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: bert_embedding_generation
 2 | entrypoint: >-
 3 |    python3 -m determined.launch.torch_distributed
 4 |    python3 bert_embedding_generation.py
 5 | 
 6 | resources:
 7 |   slots_per_trial: 2
 8 | 
 9 | searcher:
10 |    name: single
11 |    metric: x
12 |    max_length: 100
13 | 
14 | max_restarts: 0
15 | bind_mounts:
16 |   - host_path: /tmp
17 |     container_path: /tmp
18 | 


--------------------------------------------------------------------------------
/gan/gan_mnist_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: gan_mnist_pytorch_const
 2 | data:
 3 |   url: "https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz"
 4 | hyperparameters:
 5 |   global_batch_size: 32
 6 |   lr: 0.0002
 7 |   b1: 0.5
 8 |   b2: 0.999
 9 |   latent_dim: 100
10 | searcher:
11 |   name: single
12 |   metric: loss
13 |   max_length:
14 |     batches: 40000
15 |   smaller_is_better: True
16 | entrypoint: model_def:GANTrial
17 | 


--------------------------------------------------------------------------------
/tutorials/fashion_mnist_tf_keras/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: fashion_mnist_tf_keras_adaptive_search
 2 | hyperparameters:
 3 |   global_batch_size: 32
 4 |   dense1:
 5 |     type: int
 6 |     minval: 32
 7 |     maxval: 256
 8 | records_per_epoch: 60000
 9 | searcher:
10 |   name: adaptive_asha
11 |   metric: val_accuracy
12 |   smaller_is_better: false
13 |   max_length:
14 |     epochs: 5
15 |   max_trials: 10
16 | entrypoint: model_def:FashionMNISTTrial
17 | 


--------------------------------------------------------------------------------
/features/torch_batch_process_core_api_comparison/core_api_config.yaml:
--------------------------------------------------------------------------------
 1 | name: core_api_batch_inference
 2 | entrypoint: >-
 3 |    python3 -m determined.launch.torch_distributed
 4 |    python3 core_api_inference.py
 5 | 
 6 | resources:
 7 |   slots_per_trial: 2
 8 | 
 9 | searcher:
10 |    name: single
11 |    metric: x
12 |    max_length: 100
13 | max_restarts: 2
14 | bind_mounts:
15 |     - host_path: /tmp
16 |       container_path: /tmp
17 |       read_only: false
18 | 


--------------------------------------------------------------------------------
/nlp/albert_squad_pytorch/constants.py:
--------------------------------------------------------------------------------
 1 | from transformers import (
 2 |     AlbertConfig,
 3 |     AlbertForQuestionAnswering,
 4 |     AlbertTokenizer,
 5 |     BertConfig,
 6 |     BertForQuestionAnswering,
 7 |     BertTokenizer,
 8 | )
 9 | 
10 | MODEL_CLASSES = {
11 |     "bert": (BertConfig, BertTokenizer, BertForQuestionAnswering),
12 |     "albert": (
13 |         AlbertConfig,
14 |         AlbertTokenizer,
15 |         AlbertForQuestionAnswering,
16 |     ),
17 | }
18 | 


--------------------------------------------------------------------------------
/blog/tp/README.md:
--------------------------------------------------------------------------------
 1 | # Tensor Parallelism
 2 | 
 3 | Code accompanying the deep-dive [blog post on Tensor Parallelism](https://determined.ai/blog/tp).
 4 | 
 5 | - The MLP and TP MLP layers are in `layer.py`
 6 | - Matmul profiling code in `matmul_profiling.py`
 7 | - MLP TP profiling code in `tp_profiling.py`
 8 | - Tests of the rearranging tensor sums are in `test_dot_product_{local,distributed}.py`
 9 | 
10 | 
11 | ## Contributors
12 | 
13 | - [Garrett Goon](https://github.com/garrett361)


--------------------------------------------------------------------------------
/features/torch_batch_process_core_api_comparison/torch_batch_process_config.yaml:
--------------------------------------------------------------------------------
 1 | name: torch_batch_process_batch_inference
 2 | entrypoint: >-
 3 |    python3 -m determined.launch.torch_distributed
 4 |    python3 torch_batch_process_inference.py
 5 | 
 6 | resources:
 7 |   slots_per_trial: 2
 8 | 
 9 | searcher:
10 |    name: single
11 |    metric: x
12 |    max_length: 100
13 | 
14 | max_restarts: 2
15 | bind_mounts:
16 |     - host_path: /tmp
17 |       container_path: /tmp
18 |       read_only: false
19 | 


--------------------------------------------------------------------------------
/computer_vision/fasterrcnn_coco_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: fasterrcnn_coco_pytorch_const
 2 | data:
 3 |   url: https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/PennFudanPed/PennFudanPed.zip
 4 | hyperparameters:
 5 |   learning_rate: 0.005
 6 |   momentum: 0.9
 7 |   weight_decay: 0.0005
 8 |   global_batch_size: 2
 9 | searcher:
10 |   name: single
11 |   metric: val_avg_iou
12 |   smaller_is_better: false
13 |   max_length:
14 |     batches: 800
15 | entrypoint: model_def:ObjectDetectionTrial
16 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/print_models.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | from pix2pix import make_discriminator_model, make_generator_model
 3 | 
 4 | 
 5 | def main():
 6 |     generator = make_generator_model()
 7 |     tf.keras.utils.plot_model(generator, show_shapes=True, dpi=64, to_file="generator.png")
 8 |     discriminator = make_discriminator_model()
 9 |     tf.keras.utils.plot_model(discriminator, show_shapes=True, dpi=64, to_file="discriminator.png")
10 | 
11 | 
12 | if __name__ == "__main__":
13 |     main()
14 | 


--------------------------------------------------------------------------------
/gan/gan_mnist_pytorch/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: gan_mnist_pytorch_distributed 
 2 | data:
 3 |   url: "https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz"
 4 | hyperparameters:
 5 |   global_batch_size: 256 # per GPU batch size of 32
 6 |   lr: 0.0002
 7 |   b1: 0.5
 8 |   b2: 0.999
 9 |   latent_dim: 100
10 | searcher:
11 |   name: single
12 |   metric: loss
13 |   max_length:
14 |     batches: 5000
15 |   smaller_is_better: True
16 | entrypoint: model_def:GANTrial
17 | resources:
18 |   slots_per_trial: 8
19 | 


--------------------------------------------------------------------------------
/computer_vision/iris_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: iris_tf_keras_const
 2 | data:
 3 |   train_url: http://download.tensorflow.org/data/iris_training.csv
 4 |   test_url: http://download.tensorflow.org/data/iris_test.csv
 5 | hyperparameters:
 6 |   learning_rate: 1.0e-4
 7 |   learning_rate_decay: 1.0e-6
 8 |   layer1_dense_size: 16
 9 |   global_batch_size: 30
10 | searcher:
11 |   name: single
12 |   metric: val_categorical_accuracy
13 |   smaller_is_better: false
14 |   max_length:
15 |     batches: 5000
16 | entrypoint: model_def:IrisTrial
17 | 


--------------------------------------------------------------------------------
/features/checkpoint_hooks_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: mnist_pytorch_const
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz
 4 | hyperparameters:
 5 |   learning_rate: 1.0
 6 |   global_batch_size: 64
 7 |   n_filters1: 32
 8 |   n_filters2: 64
 9 |   dropout1: 0.25
10 |   dropout2: 0.5
11 | searcher:
12 |   name: single
13 |   metric: validation_loss
14 |   max_length:
15 |       batches: 937 #60,000 training images with batch size 64
16 |   smaller_is_better: true
17 | entrypoint: model_def:MNistTrial
18 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_pytorch_const
 2 | description: An example experiment using Determined AI with CIFAR10 and PyTorch.
 3 | hyperparameters:
 4 |   learning_rate: 1.0e-4
 5 |   learning_rate_decay: 1.0e-6
 6 |   layer1_dropout: 0.25
 7 |   layer2_dropout: 0.25
 8 |   layer3_dropout: 0.5
 9 |   global_batch_size: 32
10 | records_per_epoch: 50000
11 | searcher:
12 |   name: single
13 |   metric: validation_error
14 |   max_length:
15 |     epochs: 32
16 | entrypoint: model_def:CIFARTrial
17 | min_validation_period:
18 |   epochs: 1


--------------------------------------------------------------------------------
/blog/tp/matmul_profiling.yaml:
--------------------------------------------------------------------------------
 1 | name: Matmul Profiling
 2 | # Adjust the workspace and project names, as appropriate.
 3 | workspace: TP Blog Post
 4 | project: Matmul Profiling
 5 | resources:
 6 |   slots_per_trial: 1
 7 | searcher:
 8 |   name: single
 9 |   metric: not_used
10 |   max_length: 1
11 | hyperparameters:
12 |   d_model_min: 256
13 |   d_model_max: 16384
14 |   d_model_step: 256
15 |   num_warmups: 5
16 |   num_repeats: 100
17 | entrypoint: >-
18 |   python3 -m determined.launch.torch_distributed
19 |   python3 matmul_profiling.py
20 | max_restarts: 0
21 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_tf_keras_const
 2 | hyperparameters:
 3 |   learning_rate: 1.0e-4
 4 |   learning_rate_decay: 1.0e-6
 5 |   layer1_dropout: 0.25
 6 |   layer2_dropout: 0.25
 7 |   layer3_dropout: 0.5
 8 |   global_batch_size: 40
 9 |   width_factor: 0.1
10 |   height_factor: 0.1
11 |   horizontal_flip: True
12 | records_per_epoch: 50000
13 | searcher:
14 |   name: single
15 |   metric: val_categorical_error
16 |   max_length:
17 |     epochs: 32
18 | min_validation_period:
19 |   epochs: 1
20 | entrypoint: model_def:CIFARTrial
21 | 


--------------------------------------------------------------------------------
/blog/tp/tp_profiling.yaml:
--------------------------------------------------------------------------------
 1 | name: MLP TP Profiling
 2 | # Adjust the workspace and project names, as appropriate.
 3 | workspace: TP Blog Post
 4 | project: MLP TP Profiling
 5 | resources:
 6 |   slots_per_trial: 8
 7 | searcher:
 8 |   name: single
 9 |   metric: not_used
10 |   max_length: 1
11 | hyperparameters:
12 |   batch_size: 1
13 |   seq_len: 4096
14 |   d_model_min: 1024
15 |   d_model_max: 20480
16 |   d_model_step: 512
17 |   num_warmups: 5
18 |   num_repeats: 100
19 | entrypoint: >-
20 |   python3 -m determined.launch.torch_distributed
21 |   python3 tp_profiling.py
22 | max_restarts: 0
23 | 


--------------------------------------------------------------------------------
/graphs/proteins_pytorch_geometric/const.yaml:
--------------------------------------------------------------------------------
 1 | name: proteins_pytorch_geometric
 2 | hyperparameters:
 3 |   global_batch_size: 60
 4 |   dataset: PROTEINS
 5 |   lr: 0.0005
 6 |   topk_pooling_ratio: 0.8
 7 |   dropout: 0.5
 8 |   training_records: 890
 9 | records_per_epoch: 890
10 | min_validation_period:
11 |   epochs: 1
12 | searcher:
13 |   name: single
14 |   metric: validation_loss
15 |   max_length:
16 |     epochs: 200
17 |   smaller_is_better: true
18 | entrypoint: model_def:GraphConvTrial
19 | environment:
20 |   image:
21 |     cuda: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1
22 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/const_fake.yaml:
--------------------------------------------------------------------------------
 1 | name: detectron2_const_e2e_tests
 2 | environment:
 3 |     image: "determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10"
 4 |     environment_variables:
 5 |       - DETECTRON2_DATASETS=.
 6 | hyperparameters:
 7 |   global_batch_size: 1
 8 |   model_yaml: mask_rcnn_R_50_FPN_noaug_1x.yaml
 9 |   output_dir: None
10 |   fake_data: True
11 | searcher:
12 |   name: single
13 |   metric: bboxAP
14 |   max_length: 
15 |     batches: 100
16 |   smaller_is_better: false
17 | resources:
18 |     slots_per_trial: 1
19 | entrypoint: model_def:DetectronTrial
20 | max_restarts: 0
21 | 


--------------------------------------------------------------------------------
/computer_vision/iris_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: iris_tf_keras_distributed
 2 | data:
 3 |   train_url: http://download.tensorflow.org/data/iris_training.csv
 4 |   test_url: http://download.tensorflow.org/data/iris_test.csv
 5 | hyperparameters:
 6 |   learning_rate: 1.0e-4
 7 |   learning_rate_decay: 1.0e-6
 8 |   layer1_dense_size: 16
 9 |   global_batch_size: 30
10 | resources:
11 |   slots_per_trial: 2 # Use 2 GPUs to train the model.
12 | searcher:
13 |   name: single
14 |   metric: val_categorical_accuracy
15 |   smaller_is_better: false
16 |   max_length:
17 |     batches: 2500
18 | entrypoint: model_def:IrisTrial
19 | 


--------------------------------------------------------------------------------
/features/custom_reducers_mnist_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: custom_reducers_mnist_pytorch_const
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz
 4 | hyperparameters:
 5 |   learning_rate: 1.0
 6 |   global_batch_size: 64
 7 |   n_filters1: 32
 8 |   n_filters2: 64
 9 |   dropout1: 0.25
10 |   dropout2: 0.5
11 | searcher:
12 |   name: single
13 |   metric: validation_loss
14 |   max_length:
15 |       batches: 937 #60,000 training images with batch size 64
16 |   smaller_is_better: true
17 | entrypoint: model_def:MNistTrial
18 | 
19 | # Show off validation metrics.
20 | min_validation_period:
21 |   batches: 100
22 | 


--------------------------------------------------------------------------------
/nlp/bert_glue_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: bert_glue_pytorch_const
 2 | hyperparameters:
 3 |   global_batch_size: 24
 4 |   learning_rate: 2.0e-5
 5 |   lr_scheduler_epoch_freq: 1
 6 |   model_type: 'bert'
 7 |   adam_epsilon: 1.0e-8
 8 |   weight_decay: 0
 9 |   num_warmup_steps: 0
10 |   num_training_steps: 459
11 |   max_seq_length: 128
12 | searcher:
13 |   name: single
14 |   metric: acc
15 |   max_length:
16 |     batches: 400
17 |   smaller_is_better: false
18 | data:
19 |   task: 'MRPC'
20 |   model_name_or_path: "bert-base-uncased"
21 |   output_mode: "classification"
22 |   path_to_mrpc: ''
23 |   download_data: True
24 | entrypoint: model_def:BertPyTorch
25 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/mask_rcnn_R_50_FPN_noaug_1x.yaml:
--------------------------------------------------------------------------------
 1 | _BASE_: "Base-RCNN-FPN.yaml"
 2 | MODEL:
 3 |   WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl"
 4 |   MASK_ON: True
 5 |   RESNETS:
 6 |     DEPTH: 50
 7 |   # Detectron1 uses smooth L1 loss with some magic beta values.
 8 |   # The defaults are changed to L1 loss in Detectron2.
 9 |   RPN:
10 |     SMOOTH_L1_BETA: 0.1111
11 |   ROI_BOX_HEAD:
12 |     SMOOTH_L1_BETA: 1.0
13 |     POOLER_SAMPLING_RATIO: 2
14 |     POOLER_TYPE: "ROIAlign"
15 |   ROI_MASK_HEAD:
16 |     POOLER_SAMPLING_RATIO: 2
17 |     POOLER_TYPE: "ROIAlign"
18 | INPUT:
19 |   # no scale augmentation
20 |   MIN_SIZE_TRAIN: (800, )


--------------------------------------------------------------------------------
/computer_vision/fasterrcnn_coco_pytorch/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: fasterrcnn_coco_pytorch_adaptive_search
 2 | data:
 3 |   url: https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/PennFudanPed/PennFudanPed.zip
 4 | hyperparameters:
 5 |   learning_rate:
 6 |     type: double
 7 |     minval: 0.0001
 8 |     maxval: 0.001
 9 |   momentum:
10 |     type: double
11 |     minval: 0.2
12 |     maxval: 1.0
13 |   weight_decay: 0.0005
14 |   global_batch_size: 2
15 | searcher:
16 |   name: adaptive_asha
17 |   metric: val_avg_iou
18 |   smaller_is_better: false
19 |   max_length:
20 |     batches: 800
21 |   max_trials: 16
22 | entrypoint: model_def:ObjectDetectionTrial
23 | 


--------------------------------------------------------------------------------
/graphs/proteins_pytorch_geometric/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: proteins_pytorch_geometric_distributed
 2 | hyperparameters:
 3 |   global_batch_size: 60
 4 |   dataset: PROTEINS
 5 |   lr: 0.0005
 6 |   topk_pooling_ratio: 0.8
 7 |   dropout: 0.5
 8 |   training_records: 890
 9 | records_per_epoch: 890
10 | min_validation_period:
11 |   epochs: 1
12 | searcher:
13 |   name: single
14 |   metric: validation_loss
15 |   max_length:
16 |     epochs: 200
17 |   smaller_is_better: true
18 | entrypoint: model_def:GraphConvTrial
19 | environment:
20 |   image:
21 |     cuda: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1
22 | resources:
23 |   slots_per_trial: 4
24 | 


--------------------------------------------------------------------------------
/features/ports_flask/README.md:
--------------------------------------------------------------------------------
 1 | # Determined experiment that spins up a Flask server
 2 | 
 3 | This example includes two experiments:
 4 | 
 5 | 1. `hello-server`, a Flask-based "hello world" web app.
 6 | 2. `hello-client`, which launches `hello-server`, waits for the server to come up, makes a request to it, then kills it and shuts down.
 7 | 
 8 | To launch this example:
 9 | 
10 |     det e create hello-client.yaml . -f
11 | 
12 | Upon successful completion, you should see the following in the experiment logs:
13 | 
14 |     Got server response:  {'data': 'Hello World'}
15 |     SUCCESS!
16 |     Killed experiment <hello-server experiment id>
17 |     hello-server is killed.
18 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_pytorch_distributed
 2 | description: An example experiment using Determined AI with CIFAR10, PyTorch and distributed multi-GPU training.
 3 | hyperparameters:
 4 |   learning_rate: 1.0e-4
 5 |   learning_rate_decay: 1.0e-6
 6 |   layer1_dropout: 0.25
 7 |   layer2_dropout: 0.25
 8 |   layer3_dropout: 0.5
 9 |   global_batch_size: 512 # Per-GPU batch size of 32
10 | resources:
11 |   slots_per_trial: 16
12 | records_per_epoch: 50000
13 | searcher:
14 |   name: single
15 |   metric: validation_error
16 |   max_length:
17 |     epochs: 32
18 | entrypoint: model_def:CIFARTrial
19 | min_validation_period:
20 |   epochs: 1
21 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_tf_keras_distributed
 2 | hyperparameters:
 3 |   learning_rate: 1.0e-4
 4 |   learning_rate_decay: 1.0e-6
 5 |   layer1_dropout: 0.25
 6 |   layer2_dropout: 0.25
 7 |   layer3_dropout: 0.5
 8 |   global_batch_size: 512 # Per-GPU batch size of 32
 9 |   width_factor: 0.1
10 |   height_factor: 0.1
11 |   horizontal_flip: True
12 | records_per_epoch: 50000
13 | resources:
14 |   slots_per_trial: 16 # Use 16 GPUs to train the model.
15 | searcher:
16 |   name: single
17 |   metric: val_categorical_error
18 |   max_length:
19 |     epochs: 32
20 | min_validation_period:
21 |   epochs: 1
22 | entrypoint: model_def:CIFARTrial
23 | 


--------------------------------------------------------------------------------
/blog/act-mem-2/README.md:
--------------------------------------------------------------------------------
 1 | # Activation Memory: Part 2
 2 | 
 3 | Code accompanying the deep-dive [blog post on activation memory](https://determined.ai/blog/act-mem-2).
 4 | 
 5 | - The main utility code is in `act_mem.py`. 
 6 | - Basic transformer layers are implemented in `layers.py`.
 7 | - The scripts `{block,mlp}_script.py` demonstrate how replacing `GELU` with `ReLU` affects activation
 8 | memory. 
 9 | - `attn_script.py` shows the cost of activation memory in the attention layer. 
10 | - Tests of the code are in `test.py`. 
11 | - See `requirements.txt` for versions the code was built against.
12 | 
13 | 
14 | ## Contributors
15 | 
16 | - [Garrett Goon](https://github.com/garrett361)


--------------------------------------------------------------------------------
/features/checkpoint_hooks_pytorch/layers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from determined.pytorch import TorchData
 7 | 
 8 | 
 9 | class Flatten(nn.Module):  # Module that collapses every dimension after the first into a single one.
10 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
11 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
12 |         x = args[0]
13 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
14 |         return x.contiguous().view(x.size(0), -1)  # first dimension kept, the rest merged; contiguous() precedes view()
15 | 
16 | 
17 | class Squeeze(nn.Module):  # Module that applies torch.squeeze to its single tensor input.
18 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
19 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
20 |         x = args[0]
21 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
22 |         return torch.squeeze(x)  # no dim given: every size-1 dimension is removed, including the first if it has size 1
23 | 


--------------------------------------------------------------------------------
/fsdp/minimal_fsdp/config.yaml:
--------------------------------------------------------------------------------
 1 | name: fsdp example
 2 | entrypoint: python3 -m determined.launch.torch_distributed -- python3 fsdp.py
 3 | searcher:
 4 |   name: single
 5 |   metric: loss
 6 |   max_length: 100
 7 | resources:
 8 |   slots_per_trial: 2
 9 | environment:
10 |   image:
11 |     gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-mpi-0.31.1
12 | hyperparameters:
13 |   batch_size: 1
14 |   lr: 1e-4
15 |   d_model: 512
16 |   max_seq_len: 2048
17 |   n_heads: 8
18 |   n_layers: 4
19 |   vocab_size: 32000
20 |   report_rate: 10
21 |   checkpoint_rate: 50
22 |   amp_dtype: float16
23 |   validation_batches: 10
24 |   core_api_profiler: false
25 |   torch_profiler: false
26 | max_restarts: 0
27 | 


--------------------------------------------------------------------------------
/features/custom_reducers_mnist_pytorch/layers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from determined.pytorch import TorchData
 7 | 
 8 | 
 9 | class Flatten(nn.Module):  # Module that collapses every dimension after the first into a single one.
10 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
11 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
12 |         x = args[0]
13 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
14 |         return x.contiguous().view(x.size(0), -1)  # first dimension kept, the rest merged; contiguous() precedes view()
15 | 
16 | 
17 | class Squeeze(nn.Module):  # Module that applies torch.squeeze to its single tensor input.
18 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
19 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
20 |         x = args[0]
21 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
22 |         return torch.squeeze(x)  # no dim given: every size-1 dimension is removed, including the first if it has size 1
23 | 


--------------------------------------------------------------------------------
/features/hp_constraints_mnist_pytorch/layers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from determined.pytorch import TorchData
 7 | 
 8 | 
 9 | class Flatten(nn.Module):  # Module that collapses every dimension after the first into a single one.
10 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
11 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
12 |         x = args[0]
13 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
14 |         return x.contiguous().view(x.size(0), -1)  # first dimension kept, the rest merged; contiguous() precedes view()
15 | 
16 | 
17 | class Squeeze(nn.Module):  # Module that applies torch.squeeze to its single tensor input.
18 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
19 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
20 |         x = args[0]
21 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
22 |         return torch.squeeze(x)  # no dim given: every size-1 dimension is removed, including the first if it has size 1
23 | 


--------------------------------------------------------------------------------
/computer_vision/unets_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: unets_tf_keras_const
 2 | data:
 3 |   BUFFER_SIZE: 1000
 4 |   data_file: mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5
 5 | 
 6 | hyperparameters:
 7 |   learning_rate: 1.0e-4
 8 |   learning_rate_decay: 1.0e-6
 9 |   layer1_dense_size: 16
10 |   global_batch_size: 64
11 |   OUTPUT_CHANNELS: 3
12 | 
13 | searcher:
14 |   name: single
15 |   metric: val_accuracy
16 |   smaller_is_better: false
17 |   max_length:
18 |     batches: 1140
19 | 
20 | min_validation_period:
21 |   batches: 57
22 | entrypoint: model_def:UNetsTrial
23 | scheduling_unit: 57
24 | environment:
25 |     image: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1
26 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/experiment_files/layers.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | 
 3 | import torch
 4 | from torch import nn
 5 | 
 6 | from determined.pytorch import TorchData
 7 | 
 8 | 
 9 | class Flatten(nn.Module):  # Module that collapses every dimension after the first into a single one.
10 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
11 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
12 |         x = args[0]
13 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
14 |         return x.contiguous().view(x.size(0), -1)  # first dimension kept, the rest merged; contiguous() precedes view()
15 | 
16 | 
17 | class Squeeze(nn.Module):  # Module that applies torch.squeeze to its single tensor input.
18 |     def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
19 |         assert len(args) == 1  # exactly one positional input is accepted; kwargs are never read
20 |         x = args[0]
21 |         assert isinstance(x, torch.Tensor)  # narrows the TorchData input to a plain tensor
22 |         return torch.squeeze(x)  # no dim given: every size-1 dimension is removed, including the first if it has size 1
23 | 


--------------------------------------------------------------------------------
/features/custom_reducers_mnist_pytorch/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: custom_reducers_mnist_pytorch_distributed
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz
 4 | hyperparameters:
 5 |   learning_rate: 1.0
 6 |   global_batch_size: 512
 7 |   n_filters1: 32
 8 |   n_filters2: 64
 9 |   dropout1: 0.25
10 |   dropout2: 0.5
11 | resources:
12 |   slots_per_trial: 8
13 | searcher:
14 |   name: single
15 |   metric: validation_loss
16 |   max_length:
17 |       batches: 117  #60,000 training images with batch size 512 (batch size 64 per GPU)
18 |   smaller_is_better: true
19 | entrypoint: model_def:MNistTrial
20 | 
21 | # Show off validation metrics.
22 | min_validation_period:
23 |   batches: 100
24 | 


--------------------------------------------------------------------------------
/nlp/bert_glue_pytorch/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: bert_glue_pytorch_distributed 
 2 | hyperparameters:
 3 |   global_batch_size: 192 # per gpu batch size of 24
 4 |   learning_rate: 2.0e-5
 5 |   lr_scheduler_epoch_freq: 1
 6 |   model_type: 'bert'
 7 |   adam_epsilon: 1.0e-8
 8 |   weight_decay: 0
 9 |   num_warmup_steps: 0
10 |   num_training_steps: 459
11 |   max_seq_length: 128
12 | searcher:
13 |   name: single
14 |   metric: acc
15 |   max_length:
16 |     batches: 50
17 |   smaller_is_better: false
18 | resources:
19 |     slots_per_trial: 8
20 | data:
21 |   task: 'MRPC'
22 |   model_name_or_path: "bert-base-uncased"
23 |   output_mode: "classification"
24 |   path_to_mrpc: ''
25 |   download_data: True
26 | entrypoint: model_def:BertPyTorch
27 | 


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
 1 | apt-get update  # refresh package lists so the install below can resolve unzip
 2 | apt-get install -y unzip  # -y: answer yes automatically; nobody is present to confirm a prompt in a startup hook
 3 | 
 4 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
 5 | unzip -o annotations_trainval2017.zip  # -o: overwrite existing files without asking
 6 | mv annotations/instances_train2017.json /tmp
 7 | mv annotations/instances_val2017.json /tmp
 8 | 
 9 | git clone https://github.com/facebookresearch/detr.git
10 | cd detr && git reset --hard 4e1a9281bc5621dcd65f3438631de25e255c4269  # pin the checkout to a fixed commit
11 | # Upstream parses the first three characters of the torchvision version as a float, so "0.10" reads as 0.1; patch it to compare the minor version as an integer.
12 | sed -i 's/float(torchvision\.__version__\[:3\]) < 0.7/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py
13 | cd ..
14 | 
15 | pip install attrdict
16 | pip install pycocotools
17 | 


--------------------------------------------------------------------------------
/computer_vision/iris_tf_keras/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: iris_tf_keras_adaptive_search
 2 | data:
 3 |   train_url: http://download.tensorflow.org/data/iris_training.csv
 4 |   test_url: http://download.tensorflow.org/data/iris_test.csv
 5 | hyperparameters:
 6 |   learning_rate:
 7 |     type: log
 8 |     minval: -5.0
 9 |     maxval: 1.0
10 |     base: 10.0
11 |   learning_rate_decay: 1.0e-6
12 |   layer1_dense_size:
13 |     type: int
14 |     minval: 4
15 |     maxval: 32
16 |   global_batch_size:
17 |     type: int
18 |     minval: 5
19 |     maxval: 30
20 | searcher:
21 |   name: adaptive_asha
22 |   metric: val_categorical_accuracy
23 |   smaller_is_better: false
24 |   max_length:
25 |     batches: 6400
26 |   max_trials: 512
27 | entrypoint: model_def:IrisTrial
28 | 


--------------------------------------------------------------------------------
/gan/cyclegan/1-gpu.yaml:
--------------------------------------------------------------------------------
 1 | description: Cycle GAN Pytorch 1 GPU
 2 | data:
 3 |   downloaded_path: /tmp
 4 |   dataset_name: monet2photo
 5 |   n_cpu: 8
 6 |   img_height: 256
 7 |   img_width: 256
 8 |   channels: 3
 9 |   sample_interval: 3000
10 | hyperparameters:
11 |   global_batch_size: 1
12 |   lr: 0.0002
13 |   b1: 0.5
14 |   b2: 0.999
15 |   decay_epoch: 100  # epoch from which to start lr decay
16 |   n_residual_blocks: 9  # number of residual blocks in generator
17 |   lambda_cyc: 10.0
18 |   lambda_id: 5.0
19 | records_per_epoch: 6287
20 | searcher:
21 |   name: single
22 |   metric: loss_real_D
23 |   max_length:
24 |     epochs: 2000
25 |   smaller_is_better: True
26 | entrypoint: determined_model_def:CycleGANTrial
27 | min_checkpoint_period:
28 |   epochs: 1
29 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/const.yaml:
--------------------------------------------------------------------------------
 1 | name: pix2pix_facades_const
 2 | data:
 3 |   base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets
 4 |   dataset: facades
 5 |   BUFFER_SIZE: 400
 6 |   height: 256
 7 |   width: 256
 8 | hyperparameters:
 9 |   global_batch_size: 1
10 |   discriminator_lr: 2e-4
11 |   discriminator_beta_1: 0.5
12 |   generator_lr: 2e-4
13 |   generator_beta_1: 0.5
14 |   jitter: 30
15 |   mirror: true
16 | records_per_epoch: 400  # There are 400 images in the facades training set
17 | min_validation_period:
18 |   batches: 40
19 | min_checkpoint_period:
20 |   batches: 400
21 | searcher:
22 |   name: single
23 |   metric: val_total_loss
24 |   smaller_is_better: true
25 |   max_length:
26 |     batches: 4000
27 | entrypoint: model_def:Pix2PixTrial
28 | 


--------------------------------------------------------------------------------
/blog/python_sdk_demo/mednist_model/config.yaml:
--------------------------------------------------------------------------------
 1 | hyperparameters:
 2 |     global_batch_size: 128
 3 |     lr: 0.001
 4 |     weight_decay:
 5 |         type: log
 6 |         base: 10
 7 |         minval: -4
 8 |         maxval: -1
 9 |     beta1:
10 |         type: double
11 |         minval: 0.1
12 |         maxval: 0.999
13 |     beta2:
14 |         type: double
15 |         minval: 0.1
16 |         maxval: 0.999
17 |     gamma: 0.1
18 | min_validation_period:
19 |     epochs: 1
20 | searcher:
21 |     name: adaptive_asha
22 |     metric: val_accuracy
23 |     smaller_is_better: false
24 |     max_length: 
25 |       epochs: 1
26 |     max_trials: 3
27 |     mode: aggressive
28 | resources:
29 |     slots_per_trial: 1
30 | entrypoint: model_def:MyMEDMnistTrial
31 | max_restarts: 0
32 | 
33 | 


--------------------------------------------------------------------------------
/computer_vision/unets_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: unets_tf_keras_distributed
 2 | data:
 3 |   BUFFER_SIZE: 1000
 4 |   data_file: mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5
 5 | 
 6 | hyperparameters:
 7 |   learning_rate: 1.0e-4
 8 |   learning_rate_decay: 1.0e-6
 9 |   layer1_dense_size: 16
10 |   global_batch_size: 512 # per slot batch size = 64 
11 |   OUTPUT_CHANNELS: 3
12 | 
13 | searcher:
14 |   name: single
15 |   metric: val_accuracy
16 |   smaller_is_better: false
17 |   max_length:
18 |     batches: 160 
19 | 
20 | resources:
21 |     slots_per_trial: 8
22 | 
23 | min_validation_period:
24 |   batches: 8
25 | scheduling_unit: 8
26 | entrypoint: model_def:UNetsTrial
27 | environment:
28 |     image: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1
29 | 


--------------------------------------------------------------------------------
/gan/cyclegan/8-gpus.yaml:
--------------------------------------------------------------------------------
 1 | description: Cycle GAN Pytorch 8 GPUs
 2 | data:
 3 |   downloaded_path: /tmp
 4 |   dataset_name: monet2photo
 5 |   n_cpu: 8
 6 |   img_height: 256
 7 |   img_width: 256
 8 |   channels: 3
 9 |   sample_interval: 3000
10 | hyperparameters:
11 |   global_batch_size: 8
12 |   lr: 0.0002
13 |   b1: 0.5
14 |   b2: 0.999
15 |   decay_epoch: 100  # epoch from which to start lr decay
16 |   n_residual_blocks: 9  # number of residual blocks in generator
17 |   lambda_cyc: 10.0
18 |   lambda_id: 5.0
19 | records_per_epoch: 6287
20 | searcher:
21 |   name: single
22 |   metric: loss_real_D
23 |   max_length:
24 |     epochs: 2000
25 |   smaller_is_better: True
26 | entrypoint: determined_model_def:CycleGANTrial
27 | resources:
28 |   slots_per_trial: 8
29 | min_checkpoint_period:
30 |   epochs: 1
31 | 


--------------------------------------------------------------------------------
/gan/cyclegan/64-gpus.yaml:
--------------------------------------------------------------------------------
 1 | description: Cycle GAN Pytorch 64 GPUs
 2 | data:
 3 |   downloaded_path: /tmp
 4 |   dataset_name: monet2photo
 5 |   n_cpu: 8
 6 |   img_height: 256
 7 |   img_width: 256
 8 |   channels: 3
 9 |   sample_interval: 3000
10 | hyperparameters:
11 |   global_batch_size: 64
12 |   lr: 0.0002
13 |   b1: 0.5
14 |   b2: 0.999
15 |   decay_epoch: 100  # epoch from which to start lr decay
16 |   n_residual_blocks: 9  # number of residual blocks in generator
17 |   lambda_cyc: 10.0
18 |   lambda_id: 5.0
19 | records_per_epoch: 6287
20 | searcher:
21 |   name: single
22 |   metric: loss_real_D
23 |   max_length:
24 |     epochs: 2000
25 |   smaller_is_better: True
26 | entrypoint: determined_model_def:CycleGANTrial
27 | resources:
28 |   slots_per_trial: 64
29 | min_checkpoint_period:
30 |   epochs: 1
31 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: pix2pix_facades_distributed
 2 | data:
 3 |   base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets
 4 |   dataset: facades
 5 |   BUFFER_SIZE: 400
 6 |   height: 256
 7 |   width: 256
 8 | hyperparameters:
 9 |   global_batch_size: 4
10 |   discriminator_lr: 2e-4
11 |   discriminator_beta_1: 0.5
12 |   generator_lr: 2e-4
13 |   generator_beta_1: 0.5
14 |   jitter: 30
15 |   mirror: true
16 | records_per_epoch: 400  # There are 400 images in the facades training set
17 | min_validation_period:
18 |   batches: 40
19 | min_checkpoint_period:
20 |   batches: 400
21 | searcher:
22 |   name: single
23 |   metric: val_total_loss
24 |   smaller_is_better: true
25 |   max_length:
26 |     batches: 4000
27 | entrypoint: model_def:Pix2PixTrial
28 | resources:
29 |   slots_per_trial: 4
30 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # All log files
 6 | *.log
 7 | 
 8 | # Jupyter Notebook
 9 | .ipynb_checkpoints
10 | 
11 | # pyenv
12 | .python-version
13 | 
14 | # dotenv
15 | .env
16 | 
17 | # virtualenv
18 | .venv
19 | venv/
20 | ENV/
21 | 
22 | # mypy
23 | .mypy_cache/
24 | 
25 | # Determined distributable package
26 | determined-*.tar.gz
27 | 
28 | # All Python wheels
29 | *.whl
30 | 
31 | # Node modules
32 | node_modules/
33 | 
34 | # VSCode
35 | .vscode/
36 | 
37 | # JetBrains IDEs (e.g., PyCharm and GoLand)
38 | .idea/
39 | 
40 | # gobin directory used for tests
41 | gobin
42 | 
43 | # MacOS system files
44 | *.DS_Store
45 | .dccache
46 | 
47 | # Hydra output
48 | model_hub/mmdetection/hydra/outputs
49 | 
50 | build/


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/configs/config.yaml:
--------------------------------------------------------------------------------
 1 | name: model_hub_mmdet_experiment
 2 | defaults:
 3 |     - data: disk
 4 |     - profiling: disabled
 5 |     - searcher: single
 6 |     - hyperparameters:
 7 |         - base
 8 | 
 9 | checkpoint_storage:
10 |   save_trial_latest: 5
11 | 
12 | min_validation_period:
13 |   batches: 7320
14 | 
15 | environment:
16 |   image:
17 |     gpu: determinedai/model-hub-mmdetection:0.26.2-dev0
18 |   environment_variables:
19 |       - OMP_NUM_THREADS=1 # Following pytorch dtrain, this environment variable is set to 1 to avoid overloading the system.
20 | 
21 | resources:
22 |   slots_per_trial: 8 # max number of GPUs a trial is allowed to individually use
23 |   shm_size: 200000000000
24 | entrypoint: python3 -m determined.launch.torch_distributed --trial model_hub.mmdetection:MMDetTrial
25 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: detectron2_const
 2 | environment:
 3 |     image: "determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10"
 4 |     environment_variables:
 5 |       - DETECTRON2_DATASETS=/mnt/dtrain-fsx/detectron2
 6 | hyperparameters:
 7 |   global_batch_size: 16 # Detectron defaults to 16 regardless of N GPUs
 8 |   model_yaml: mask_rcnn_R_50_FPN_noaug_1x.yaml
 9 |   output_dir: None
10 |   fake_data: False
11 | searcher:
12 |   name: single
13 |   metric: bboxAP
14 |   max_length: 
15 |     batches: 90000
16 |   smaller_is_better: false
17 | resources:
18 |     slots_per_trial: 1
19 | entrypoint: model_def:DetectronTrial
20 | bind_mounts:
21 |   - host_path: /path/to/data
22 |     container_path: /mnt/dtrain-fsx/detectron2
23 |     read_only: true
24 | min_validation_period:
25 |   batches: 5000
26 | 


--------------------------------------------------------------------------------
/computer_vision/byol_pytorch/backbone.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Callable
 3 | 
 4 | import torch.nn as nn
 5 | import torchvision.models as models
 6 | 
 7 | 
 8 | @dataclass
 9 | class BackboneMetadata:  # Record describing one selectable backbone: a size plus a factory for the module.
10 |     feature_size: int  # presumably the width of the backbone's output features — TODO confirm against the consumer
11 |     build_fn: Callable[[], nn.Module]  # zero-argument factory that returns the backbone module
12 | 
13 | 
14 | BACKBONE_METADATA_BY_NAME = {  # Registry keyed by backbone name; each build_fn is a lambda, so no model is built until it is called.
15 |     "resnet18": BackboneMetadata(
16 |         feature_size=512, build_fn=lambda: models.resnet18(pretrained=True)  # NOTE(review): pretrained=True requests torchvision's pretrained weights — confirm this is intended
17 |     ),
18 |     "resnet34": BackboneMetadata(
19 |         feature_size=512, build_fn=lambda: models.resnet34(pretrained=True)
20 |     ),
21 |     "resnet50": BackboneMetadata(
22 |         feature_size=2048, build_fn=lambda: models.resnet50(pretrained=True)
23 |     ),
24 |     "resnet101": BackboneMetadata(
25 |         feature_size=2048, build_fn=lambda: models.resnet101(pretrained=True)
26 |     ),
27 | }
28 | 


--------------------------------------------------------------------------------
/features/hp_constraints_mnist_pytorch/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: mnist_pytorch_constrained_adaptive
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-test-data/pytorch_mnist.tar.gz
 4 | hyperparameters:
 5 |   global_batch_size: 64
 6 |   learning_rate:
 7 |     type: double
 8 |     minval: .0001
 9 |     maxval: 1.0
10 |   n_filters1:
11 |     type: int
12 |     minval: 8
13 |     maxval: 64
14 |   n_filters2:
15 |     type: int
16 |     minval: 8
17 |     maxval: 72
18 |   dropout1:
19 |     type: double
20 |     minval: .2
21 |     maxval: .8
22 |   dropout2:
23 |     type: double
24 |     minval: .2
25 |     maxval: .8
26 | searcher:
27 |   name: adaptive_asha
28 |   metric: validation_loss
29 |   smaller_is_better: true
30 |   max_trials: 16
31 |   max_length:
32 |     batches: 937 #60,000 training images with batch size 64
33 | entrypoint: model_def:MNistTrial
34 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: detectron2_distributed
 2 | environment:
 3 |     image: "determinedai/example-detectron2:0.6-cuda-10.2-pytorch-1.10"
 4 |     environment_variables:
 5 |       - DETECTRON2_DATASETS=/mnt/dtrain-fsx/detectron2
 6 | hyperparameters:
 7 |   global_batch_size: 16 # Detectron defaults to 16 regardless of N GPUs
 8 |   model_yaml: mask_rcnn_R_50_FPN_noaug_1x.yaml
 9 |   output_dir: None
10 |   fake_data: False
11 | searcher:
12 |   name: single
13 |   metric: bboxAP
14 |   max_length: 
15 |     batches: 90000
16 |   smaller_is_better: false
17 | resources:
18 |     slots_per_trial: 4
19 |     shm_size: 824600000000
20 | entrypoint: model_def:DetectronTrial
21 | bind_mounts:
22 |   - host_path: /path/to/data
23 |     container_path: /mnt/dtrain-fsx/detectron2
24 |     read_only: true
25 | min_validation_period:
26 |   batches: 5000
27 | 


--------------------------------------------------------------------------------
/meta_learning/protonet_omniglot_pytorch/fetch_data.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Source: https://github.com/alshedivat/meta-blocks/blob/master/benchmarks/omniglot/fetch_data.sh
 4 | # Fetch Omniglot.
 5 | #
 6 | 
 7 | OMNIGLOT_URL=https://raw.githubusercontent.com/brendenlake/omniglot/master/python
 8 | 
 9 | set -e  # abort on the first failing command
10 | 
11 | mkdir tmp  # scratch directory; because of set -e the script stops here if tmp already exists
12 | trap 'rm -r tmp' EXIT  # remove the scratch directory whenever the script exits
13 | 
14 | if [ ! -d data ]; then
15 |     mkdir data
16 | fi
17 | 
18 | if [ ! -d data/omniglot ]; then  # skip the download entirely when data/omniglot already exists
19 |     mkdir tmp/omniglot
20 |     for name in images_background images_evaluation; do
21 |         echo "Fetching omniglot/$name ..."
22 |         curl -# "$OMNIGLOT_URL/$name.zip" >"tmp/$name.zip"  # -#: show a progress bar
23 |         echo "Extracting omniglot/$name ..."
24 |         unzip -q "tmp/$name.zip" -d tmp  # -q: quiet; extracts under tmp
25 |         rm "tmp/$name.zip"
26 |         mv tmp/$name/* tmp/omniglot  # merge the contents of both archives into one directory
27 |     done
28 |     mv tmp/omniglot data/omniglot  # move the merged result into place only after both archives succeeded
29 | fi
30 | 


--------------------------------------------------------------------------------
/blog/act-mem-2/attn_script.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | 
 3 | import act_mem
 4 | import layers
 5 | 
 6 | if __name__ == "__main__":  # Script entry point: run one attention forward pass and print memory statistics.
 7 |     batch_size, seq_len, d_model, n_heads = 2, 4096, 1024, 32
 8 |     dtype = torch.bfloat16
 9 |     inputs = torch.randn(  # random input of shape (batch_size, seq_len, d_model)
10 |         batch_size,
11 |         seq_len,
12 |         d_model,
13 |         device="cuda",  # a CUDA device is required to run this script
14 |         requires_grad=True,
15 |         dtype=dtype,
16 |     )
17 | 
18 |     attn = layers.Attention(  # created on the same device and with the same dtype as the input
19 |         d_model=d_model,
20 |         n_heads=n_heads,
21 |         device="cuda",
22 |         dtype=dtype,
23 |     )
24 |     with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext(  # only the forward pass runs inside; backward() is never called
25 |         ignored_tensors=attn.parameters()  # presumably excludes the layer's own weights from the saved-tensor total — confirm in act_mem
26 |     ) as saved:
27 |         out = attn(inputs)
28 |     print(f'{mem.delta["current"]=}')  # the "current" entry of the delta recorded by AllocatedMemContext
29 |     print(f"{saved.saved_tensor_mem=}")
30 |     print(f"{saved.saved_tensor_mem/out.numel()=}")  # saved-tensor memory divided by the number of output elements
31 | 


--------------------------------------------------------------------------------
/graphs/proteins_pytorch_geometric/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: proteins_pytorch_geometric_adaptive
 2 | hyperparameters:
 3 |   global_batch_size:
 4 |     type: int
 5 |     minval: 16
 6 |     maxval: 128
 7 |   dataset: PROTEINS
 8 |   lr:
 9 |     type: log
10 |     base: 10.0
11 |     minval: -6
12 |     maxval: -1
13 |   topk_pooling_ratio:
14 |     type: double
15 |     minval: 0.1
16 |     maxval: 0.9
17 |   dropout:
18 |     type: double
19 |     minval: 0.2
20 |     maxval: 0.8
21 |   training_records: 890
22 | records_per_epoch: 890
23 | min_validation_period:
24 |   epochs: 1
25 | searcher:
26 |   name: adaptive_asha
27 |   metric: validation_loss
28 |   max_length:
29 |     epochs: 200
30 |   smaller_is_better: true
31 |   max_trials: 1000
32 | entrypoint: model_def:GraphConvTrial
33 | environment:
34 |   image:
35 |     cuda: determinedai/environments:cuda-11.3-pytorch-1.12-tf-2.11-gpu-2b7e2a1
36 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/chat_format.py:
--------------------------------------------------------------------------------
 1 | CHAT_ML_TEMPLATE = """
 2 | {% for message in messages %}
 3 | {% if message['role'] == 'user' %}
 4 | {{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
 5 | {% elif message['role'] == 'system' %}
 6 | {{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
 7 | {% elif message['role'] == 'assistant' %}
 8 | {{'<|im_start|>assistant\n'  + message['content'] + '<|im_end|>' }}
 9 | {% endif %}
10 | {% endfor %}
11 | """
12 | 
13 | CHAT_ML_END_TURN_TOKEN = "<|im_end|>"  # marker CHAT_ML_TEMPLATE places after each message
14 | CHAT_ML_START_TURN_TOKEN = "<|im_start|>"  # marker CHAT_ML_TEMPLATE places before each role name
15 | 
16 | 
17 | def get_assistant_prompt():  # Returns the literal header that opens an assistant turn.
18 |     return "<|im_start|>assistant\n"  # same prefix CHAT_ML_TEMPLATE emits before assistant content
19 | 
20 | 
21 | def get_response_template_ids(tokenizer):  # Encodes the assistant-turn header with the given tokenizer and returns the result.
22 |     return tokenizer.encode(get_assistant_prompt(), add_special_tokens=False)  # asks the tokenizer not to add its own special tokens around the header
23 | 
24 | 
25 | def maybe_add_generation_prompt(text: str) -> str:  # Returns text with the assistant-turn header appended.
26 |     return text + get_assistant_prompt()  # NOTE: appended unconditionally, despite the "maybe" in the name
27 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_pytorch_adaptive_search
 2 | description: An example experiment of hyperparameter tuning using Determined AI with CIFAR10 and PyTorch.
 3 | hyperparameters:
 4 |   learning_rate:
 5 |     type: log
 6 |     minval: -5.0
 7 |     maxval: 1.0
 8 |     base: 10.0
 9 |   learning_rate_decay: 1.0e-6
10 |   layer1_dropout:
11 |     type: double
12 |     minval: 0.2
13 |     maxval: 0.5
14 |   layer2_dropout:
15 |     type: double
16 |     minval: 0.2
17 |     maxval: 0.5
18 |   layer3_dropout:
19 |     type: double
20 |     minval: 0.2
21 |     maxval: 0.5
22 |   global_batch_size:
23 |     type: int
24 |     minval: 16
25 |     maxval: 64
26 | records_per_epoch: 50000
27 | searcher:
28 |   name: adaptive_asha
29 |   metric: validation_error
30 |   max_length:
31 |     epochs: 32
32 |   max_trials: 16
33 | entrypoint: model_def:CIFARTrial
34 | min_validation_period:
35 |   epochs: 1
36 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: Text-to-SQL
 2 | debug: false
 3 | environment:
 4 |   environment_variables:
 5 |     - NCCL_DEBUG=INFO
 6 | resources:
 7 |   slots_per_trial: 2
 8 | searcher:
 9 |   name: single
10 |   max_length:
11 |     batches: 5000
12 |   metric: eval_accuracy
13 |   smaller_is_better: false
14 | hyperparameters:
15 |   model: "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
16 |   dataset_subset: "easy"
17 |   training_args:
18 |     output_dir: "/tmp/llm_finetuning"
19 |     max_steps: 5000
20 |     per_device_train_batch_size: 1
21 |     per_device_eval_batch_size: 4
22 |     fp16: true
23 |     evaluation_strategy: "steps"
24 |     eval_steps: 1000
25 |     logging_strategy: "steps"
26 |     logging_steps: 100
27 |     save_strategy: "steps"
28 |     save_steps: 1000
29 |     learning_rate: 1e-5
30 | entrypoint: >-
31 |   python -m determined.launch.torch_distributed
32 |   python finetune.py
33 | max_restarts: 0
34 | 


--------------------------------------------------------------------------------
/nlp/bert_glue_pytorch/constants.py:
--------------------------------------------------------------------------------
 1 | from transformers import (
 2 |     BertConfig,
 3 |     BertForSequenceClassification,
 4 |     BertTokenizer,
 5 |     DistilBertConfig,
 6 |     DistilBertForSequenceClassification,
 7 |     DistilBertTokenizer,
 8 |     RobertaConfig,
 9 |     RobertaForSequenceClassification,
10 |     RobertaTokenizer,
11 |     XLMConfig,
12 |     XLMForSequenceClassification,
13 |     XLMTokenizer,
14 |     XLNetConfig,
15 |     XLNetForSequenceClassification,
16 |     XLNetTokenizer,
17 | )
18 | 
19 | # Lookup table: model type -> (config class, sequence-classification model class, tokenizer class)
20 | MODEL_CLASSES = {
21 |     "bert": (BertConfig, BertForSequenceClassification, BertTokenizer),
22 |     "xlnet": (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
23 |     "xlm": (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
24 |     "roberta": (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
25 |     "distilbert": (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
26 | }
27 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch/distributed_inference.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_pytorch_distributed_inference
 2 | description: An example using Determined AI with CIFAR10, PyTorch and distributed batch inference.
 3 | entrypoint: >-
 4 |   python3 -m determined.launch.torch_distributed
 5 |   python3 inference_example.py
 6 | resources:
 7 |   slots_per_trial: 2
 8 | searcher:
 9 |   name: grid
10 |   metric: x
11 |   max_length: 100
12 | hyperparameters:
13 |   # Replace with the name of the model to run inference on
14 |   model_name: cifar_checkpoints
15 |   # Replace with the model versions to run inference on
16 |   model_version:
17 |     type: categorical
18 |     vals:
19 |       - 1
20 |       - 2
21 |       - 3
22 |       - 4
23 |       - 5
24 |       - 6
25 |       - 7
26 |       - 8
27 |       - 9
28 |       - 10
29 |       - 11
30 |       - 12
31 |       - 13
32 |       - 14
33 | max_restarts: 0
34 | bind_mounts:
35 |   - host_path: /tmp
36 |     container_path: /tmp
37 |     read_only: false
38 | 


--------------------------------------------------------------------------------
/deepspeed/deepspeed_dcgan/mnist.yaml:
--------------------------------------------------------------------------------
 1 | name: dcgan_deepspeed_mnist
 2 | data:
 3 |   dataroot: /data
 4 |   dataset: mnist
 5 |   image_size: 64
 6 | hyperparameters:
 7 |   deepspeed_config: ds_config.json
 8 |   noise_length: 100
 9 |   generator_width_base: 64
10 |   discriminator_width_base: 64
11 |   data_workers: 16
12 | environment:
13 |   environment_variables:
14 |     - NCCL_DEBUG=INFO
15 |     # You may need to modify this to match your network configuration.
16 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
17 |   image:
18 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
19 | bind_mounts:
20 |   - host_path: /tmp
21 |     container_path: /data
22 | resources:
23 |   slots_per_trial: 2
24 | searcher:
25 |   name: single
26 |   metric: no_validation_metric
27 |   max_length:
28 |     batches: 100000
29 | min_validation_period:
30 |   batches: 0
31 | entrypoint:
32 |   - python3
33 |   - -m
34 |   - determined.launch.deepspeed
35 |   - --trial
36 |   - model_def:DCGANTrial
37 | max_restarts: 0
38 | 


--------------------------------------------------------------------------------
/nas/gaea_pytorch/search/optimizer.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from torch.optim.optimizer import Optimizer, required
 3 | 
 4 | 
 5 | class EG(Optimizer):
 6 |     def __init__(self, params, lr=required, normalize_fn=lambda x: x):
 7 |         if lr is not required and lr < 0.0:
 8 |             raise ValueError("Invalid learning rate: {}".format(lr))
 9 |         self.normalize_fn = normalize_fn
10 |         defaults = dict(lr=lr)
11 |         super(EG, self).__init__(params, defaults)
12 | 
13 |     @torch.no_grad()
14 |     def step(self, closure=None):
15 |         loss = None
16 |         if closure is not None:
17 |             with torch.enable_grad():
18 |                 loss = closure()
19 | 
20 |         for group in self.param_groups:
21 |             for p in group["params"]:
22 |                 if p.grad is None:
23 |                     continue
24 |                 d_p = p.grad
25 |                 p.mul_(torch.exp(-group["lr"] * d_p))
26 |                 p.data = self.normalize_fn(p.data)
27 | 
28 |         return loss
29 | 


--------------------------------------------------------------------------------
/nlp/word_language_model/const.yaml:
--------------------------------------------------------------------------------
 1 | name: word_language_modeling_const
 2 | hyperparameters:
 3 |     global_batch_size: 20
 4 |     eval_batch_size: 10
 5 |     max_grad_norm: 0.25
 6 |     model_cls: Transformer
 7 |     # model_cls: LSTM
 8 |     # model_cls: GRU
 9 |     word_embeddings_size: 200
10 |     num_hidden: 200
11 |     num_layers: 2
12 |     dropout: 0.2
13 |     bptt: 35
14 |     lr: 20
15 |     # Transformer Model Only Hyperparameters
16 |     num_heads: 2
17 |     # LSTM/GRU Model Only Hyperparameters
18 |     # tied: False
19 | resources:
20 |     slots_per_trial: 1
21 | records_per_epoch: 59660
22 | searcher:
23 |     name: single
24 |     metric: validation_loss
25 |     max_length:
26 |         epochs: 40
27 |     smaller_is_better: true
28 | min_validation_period:
29 |     epochs: 1
30 | data:
31 |     use_bind_mount: True
32 |     bind_mount_path: /data
33 |     use_cache: True
34 | entrypoint: model_def:WordLanguageModelTrial
35 | bind_mounts:
36 |     - host_path: /tmp
37 |       container_path: /data
38 |       read_only: false


--------------------------------------------------------------------------------
/nas/gaea_pytorch/search/const.yaml:
--------------------------------------------------------------------------------
 1 | name: gaea_search
 2 | 
 3 | data:
 4 |   download_dir: /data
 5 | 
 6 | bind_mounts:
 7 |   - host_path: /tmp
 8 |     container_path: /data
 9 |     read_only: false
10 | 
11 | hyperparameters:
12 |     # Number of classes in dataset
13 |     n_classes: 10
14 |     # Channel shuffle factor.  1 / shuffle_factor channels are activated at a given time.
15 |     shuffle_factor: 4
16 |     global_batch_size: 256
17 |     learning_rate: 0.1
18 |     momentum: 0.9
19 |     min_learning_rate: 0
20 |     scheduler_epochs: 50
21 |     weight_decay: 3.0e-4
22 |     arch_learning_rate: 0.1
23 |     init_channels: 16
24 |     layers: 8
25 |     nodes: 4
26 | 
27 | resources:
28 |   slots_per_trial: 2
29 | 
30 | min_validation_period: 
31 |   batches: 100
32 | 
33 | records_per_epoch: 25000
34 | searcher:
35 |   name: single
36 |   metric: top1_accuracy 
37 |   smaller_is_better: false 
38 |   max_length: 
39 |     epochs: 50
40 | 
41 | optimizations:
42 |   aggregation_frequency:  1
43 | 
44 | entrypoint: model_def:GAEASearchTrial
45 | 


--------------------------------------------------------------------------------
/nlp/word_language_model/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: word_language_modeling_distributed 
 2 | hyperparameters:
 3 |     global_batch_size: 50
 4 |     eval_batch_size: 10
 5 |     max_grad_norm: 0.25
 6 |     model_cls: Transformer
 7 |     # model_cls: LSTM
 8 |     # model_cls: GRU
 9 |     word_embeddings_size: 200
10 |     num_hidden: 200
11 |     num_layers: 2
12 |     dropout: 0.2
13 |     bptt: 35
14 |     lr: 20
15 |     # Transformer Model Only Hyperparameters
16 |     num_heads: 2
17 |     # LSTM/GRU Model Only Hyperparameters
18 |     # tied: False
19 | resources:
20 |     slots_per_trial: 8
21 | records_per_epoch: 59660
22 | searcher:
23 |     name: single
24 |     metric: validation_loss
25 |     max_length:
26 |         epochs: 40
27 |     smaller_is_better: true
28 | min_validation_period:
29 |     epochs: 1
30 | data:
31 |     use_bind_mount: True
32 |     bind_mount_path: /data
33 |     use_cache: True
34 | entrypoint: model_def:WordLanguageModelTrial
35 | bind_mounts:
36 |     - host_path: /tmp
37 |       container_path: /data
38 |       read_only: false


--------------------------------------------------------------------------------
/deepspeed/cifar10_cpu_offloading/ds_config_no_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_batch_size": 128,
 3 |   "steps_per_print": 10,
 4 |   "optimizer": {
 5 |     "type": "Adam",
 6 |     "params": {
 7 |       "lr": 0.001,
 8 |       "betas": [
 9 |         0.8,
10 |         0.999
11 |       ],
12 |       "eps": 1e-8,
13 |       "weight_decay": 3e-7
14 |     }
15 |   },
16 |     "scheduler": {
17 |     "type": "WarmupLR",
18 |     "params": {
19 |       "warmup_min_lr": 0,
20 |       "warmup_max_lr": 0.001,
21 |       "warmup_num_steps": 1000
22 |     }
23 |   },
24 |   "zero_optimization": {
25 |     "stage": 0,
26 |     "allgather_partitions": true,
27 |     "allgather_bucket_size": 5e8,
28 |     "overlap_comm": true,
29 |     "reduce_scatter": true,
30 |     "reduce_bucket_size": 5e8,
31 |     "contiguous_gradients": true
32 |   },
33 |   "gradient_clipping": 1.0,
34 |   "fp16": {
35 |     "enabled": true,
36 |     "loss_scale": 0,
37 |     "initial_scale_power": 5,
38 |     "loss_scale_window": 1000,
39 |     "hysteresis": 2,
40 |     "min_loss_scale": 1
41 |   }
42 | }
43 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM determinedai/environments:cuda-10.2-base-gpu-0.20.1
 2 | 
 3 | RUN pip install tensorboard cmake onnx   # cmake from apt-get is too old
 4 | RUN pip install torch==1.10 torchvision==0.11.1 -f https://download.pytorch.org/whl/cu101/torch_stable.html
 5 | 
 6 | RUN pip install 'git+https://github.com/facebookresearch/fvcore'
 7 | # install detectron2
 8 | RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo
 9 | # set FORCE_CUDA because during `docker build` cuda is not accessible
10 | ENV FORCE_CUDA="1"
11 | # This will by default build detectron2 for all common cuda architectures and take a lot more time,
12 | # because inside `docker build`, there is no way to tell which architecture will be used.
13 | #ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing"
14 | ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla"
15 | ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}"
16 | 
17 | RUN pip install -e detectron2_repo
18 | 
19 | RUN pip install horovod==0.24.2
20 | 
21 | 


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/data.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | 
 4 | def get_train_dataset(worker_rank: int):
 5 |     (train_images, _), (_, _) = tf.keras.datasets.mnist.load_data(path=f"mnist-{worker_rank}.npz")
 6 | 
 7 |     train_images = train_images.reshape(train_images.shape[0], 28, 28, 1).astype("float32")
 8 |     train_images = (train_images - 127.5) / 127.5  # Normalize the images to [-1, 1]
 9 | 
10 |     # Batch and shuffle the data
11 |     train_dataset = tf.data.Dataset.from_tensor_slices(train_images).shuffle(50000)
12 |     return train_dataset
13 | 
14 | 
def get_validation_dataset(worker_rank: int):
    """Build a shuffled ``tf.data.Dataset`` from the MNIST test images.

    Args:
        worker_rank: Rank used to form the ``mnist-<rank>.npz`` file name that
            is passed to ``load_data`` as ``path``.

    Returns:
        An unbatched dataset of float32 images shaped ``(28, 28, 1)``, shuffled
        with a buffer of 50000 elements. Labels are discarded.
    """
    cache_file = f"mnist-{worker_rank}.npz"
    _, (images, _) = tf.keras.datasets.mnist.load_data(path=cache_file)

    # Add a trailing channel axis and switch to float32.
    num_images = images.shape[0]
    images = images.reshape(num_images, 28, 28, 1).astype("float32")
    # Rescale to [-1, 1] (assuming pixel values in 0..255).
    images = (images - 127.5) / 127.5

    # Only shuffling happens here; no batching is applied in this function.
    validation_dataset = tf.data.Dataset.from_tensor_slices(images).shuffle(50000)
    return validation_dataset
24 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/deepspeed.yaml:
--------------------------------------------------------------------------------
 1 | name: mistral deepspeed easy
 2 | debug: false
 3 | environment:
 4 |   environment_variables:
 5 |     - NCCL_DEBUG=INFO
 6 |   image: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
 7 | resources:
 8 |   slots_per_trial: 2
 9 | searcher:
10 |   name: single
11 |   max_length:
12 |     batches: 5000
13 |   metric: eval_accuracy
14 |   smaller_is_better: false
15 | hyperparameters:
16 |   model: "mistralai/Mistral-7B-Instruct-v0.2"
17 |   dataset_subset: "easy"
18 |   lora: false
19 |   training_args:
20 |     output_dir: "/tmp/llm_finetuning"
21 |     max_steps: 5000
22 |     per_device_train_batch_size: 2
23 |     per_device_eval_batch_size: 4
24 |     bf16: true
25 |     evaluation_strategy: "steps"
26 |     eval_steps: 1000
27 |     logging_strategy: "steps"
28 |     logging_steps: 100
29 |     save_strategy: "steps"
30 |     save_steps: 5000
31 |     learning_rate: 1e-5
32 |     deepspeed: "ds_configs/ds_config_stage_3.json"
33 | entrypoint: >-
34 |   python -m determined.launch.deepspeed
35 |   python finetune.py
36 | max_restarts: 0


--------------------------------------------------------------------------------
/gan/gan_mnist_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch MNIST GAN Example
 2 | 
 3 | This example demonstrates how to build a simple GAN on the MNIST dataset using
 4 | Determined's PyTorch API. This example is adapted from this [PyTorch Lightning GAN
 5 | example](https://github.com/Lightning-AI/pytorch-lightning/blob/master/examples/pytorch/domain_templates/generative_adversarial_net.py).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | * **data.py**: The data loading and preparation code for the model.
10 | 
11 | ### Configuration Files
12 | * **const.yaml**: Train the model with constant hyperparameter values.
13 | * **distributed.yaml**: Same as const.yaml, but instead uses multiple GPUs (distributed training).
14 | 
15 | ## To Run
16 | Installation instructions can be found under `docs/install-admin.html` or at the [Determined installation page](https://docs.determined.ai/latest/index.html).
17 | After configuring the settings in `const.yaml`, run the following command: `det -m <master host:port> experiment create -f const.yaml .`
18 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/ds_configs/ds_config_stage_2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "zero_optimization": {
20 |     "stage": 2,
21 |     "allgather_partitions": true,
22 |     "allgather_bucket_size": 2e8,
23 |     "overlap_comm": true,
24 |     "reduce_scatter": true,
25 |     "reduce_bucket_size": 2e8,
26 |     "contiguous_gradients": true
27 |   },
28 |   "gradient_accumulation_steps": "auto",
29 |   "gradient_clipping": "auto",
30 |   "train_batch_size": "auto",
31 |   "train_micro_batch_size_per_gpu": "auto",
32 |   "flops_profiler": {
33 |     "enabled": false,
34 |     "profile_step": 1,
35 |     "module_depth": -1,
36 |     "top_modules": 1,
37 |     "detailed": true,
38 |     "output_file": null
39 |   }
40 | }
41 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/lora.yaml:
--------------------------------------------------------------------------------
 1 | name: mistral lora easy
 2 | debug: false
 3 | environment:
 4 |   environment_variables:
 5 |     - NCCL_DEBUG=INFO
 6 |   image: 
 7 |     gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
 8 |     cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7
 9 | resources:
10 |   slots_per_trial: 2
11 | searcher:
12 |   name: single
13 |   max_length:
14 |     batches: 5000
15 |   metric: eval_accuracy
16 |   smaller_is_better: false
17 | hyperparameters:
18 |   model: "mistralai/Mistral-7B-Instruct-v0.2"
19 |   dataset_subset: "easy"
20 |   lora: true
21 |   training_args:
22 |     output_dir: "/tmp/llm_finetuning"
23 |     max_steps: 5000
24 |     per_device_train_batch_size: 8
25 |     per_device_eval_batch_size: 4
26 |     bf16: true
27 |     evaluation_strategy: "steps"
28 |     eval_steps: 1000
29 |     logging_strategy: "steps"
30 |     logging_steps: 100
31 |     save_strategy: "steps"
32 |     save_steps: 1000
33 |     learning_rate: 1e-5
34 | entrypoint: >-
35 |   python -m determined.launch.torch_distributed
36 |   python finetune.py
37 | max_restarts: 0


--------------------------------------------------------------------------------
/deepspeed/deepspeed_dcgan/mnist_grad_accum.yaml:
--------------------------------------------------------------------------------
 1 | name: dcgan_deepspeed_mnist_grad_accum
 2 | data:
 3 |   dataroot: /data
 4 |   dataset: mnist
 5 |   image_size: 64
 6 | hyperparameters:
 7 |   deepspeed_config: ds_config.json
 8 |   noise_length: 100
 9 |   generator_width_base: 64
10 |   discriminator_width_base: 64
11 |   data_workers: 16
12 |   overwrite_deepspeed_args:
13 |     gradient_accumulation_steps: 4
14 | environment:
15 |   environment_variables:
16 |     - NCCL_DEBUG=INFO
17 |     # You may need to modify this to match your network configuration.
18 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
19 |   image:
20 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
21 | bind_mounts:
22 |   - host_path: /tmp
23 |     container_path: /data
24 | resources:
25 |   slots_per_trial: 2
26 | searcher:
27 |   name: single
28 |   metric: no_validation_metric
29 |   max_length:
30 |     batches: 100000
31 | min_validation_period:
32 |   batches: 0
33 | entrypoint:
34 |   - python3
35 |   - -m
36 |   - determined.launch.deepspeed
37 |   - --trial
38 |   - model_def:DCGANTrial
39 | max_restarts: 0
40 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/ds_configs/ds_config_stage_3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "optimizer": {
14 |     "type": "AdamW",
15 |     "params": {
16 |       "lr": "auto",
17 |       "betas": "auto",
18 |       "eps": "auto",
19 |       "weight_decay": "auto"
20 |     }
21 |   },
22 |   "zero_optimization": {
23 |     "stage": 3,
24 |     "overlap_comm": true,
25 |     "contiguous_gradients": true,
26 |     "sub_group_size": 1e9,
27 |     "reduce_bucket_size": "auto",
28 |     "stage3_prefetch_bucket_size": "auto",
29 |     "stage3_param_persistence_threshold": "auto",
30 |     "stage3_max_live_parameters": 1e9,
31 |     "stage3_max_reuse_distance": 1e9,
32 |     "stage3_gather_16bit_weights_on_model_save": true
33 |   },
34 |   "gradient_accumulation_steps": "auto",
35 |   "gradient_clipping": "auto",
36 |   "train_batch_size": "auto",
37 |   "train_micro_batch_size_per_gpu": "auto"
38 | }
39 | 


--------------------------------------------------------------------------------
/deepspeed/deepspeed_dcgan/cifar10_zero2.yaml:
--------------------------------------------------------------------------------
 1 | name: dcgan_deepspeed_cifar10
 2 | data:
 3 |   dataroot: /data
 4 |   dataset: cifar10
 5 |   image_size: 64
 6 | hyperparameters:
 7 |   deepspeed_config: ds_config.json
 8 |   noise_length: 100
 9 |   generator_width_base: 64
10 |   discriminator_width_base: 64
11 |   data_workers: 16
12 |   overwrite_deepspeed_args:
13 |     zero_optimization.stage: 2
14 |     fp16.enabled: true
15 | environment:
16 |   environment_variables:
17 |     - NCCL_DEBUG=INFO
18 |     # You may need to modify this to match your network configuration.
19 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
20 |   image:
21 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
22 | bind_mounts:
23 |   - host_path: /tmp
24 |     container_path: /data
25 | resources:
26 |   slots_per_trial: 2
27 | searcher:
28 |   name: single
29 |   metric: no_validation_metric
30 |   max_length:
31 |     batches: 100000
32 | min_validation_period:
33 |   batches: 0
34 | entrypoint:
35 |   - python3
36 |   - -m
37 |   - determined.launch.deepspeed
38 |   - --trial
39 |   - model_def:DCGANTrial
40 | max_restarts: 0
41 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_cpu_offloading/zero_3_cpu_offload.yaml:
--------------------------------------------------------------------------------
 1 | name: No OOM error
 2 | debug: false
 3 | #profiling:
 4 | #    enabled: true
 5 | #    begin_on_batch: 1
 6 | #    end_after_batch: 10
 7 | #    sync_timings: false
 8 | hyperparameters:
 9 |   deepspeed_config: ds_config_offload.json
10 |   deepspeed_offload: true
11 | environment:
12 |   environment_variables:
13 |     - NCCL_DEBUG=INFO
14 |     # You may need to modify this to match your network configuration.
15 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
16 |   image:
17 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
18 | bind_mounts:
19 |   - host_path: /tmp
20 |     container_path: /data
21 |   - host_path: /tmp
22 |     container_path: /root/.cache
23 | resources:
24 |   slots_per_trial: 2
25 | records_per_epoch: 5000
26 | searcher:
27 |   name: single
28 |   metric: accuracy
29 |   smaller_is_better: false
30 |   max_length:
31 |     epochs: 1
32 | entrypoint:
33 |   - python3
34 |   - -m
35 |   - determined.launch.deepspeed
36 |   - --trial
37 |   - model_def:CIFARTrial
38 | checkpoint_policy: none
39 | max_restarts: 0
40 | scheduling_unit: 2000
41 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_cpu_offloading/zero_no_offload.yaml:
--------------------------------------------------------------------------------
 1 | name: OOM error
 2 | debug: false
 3 | #profiling:
 4 | #    enabled: true
 5 | #    begin_on_batch: 1
 6 | #    end_after_batch: 1000
 7 | #    sync_timings: false
 8 | hyperparameters:
 9 |   deepspeed_config: ds_config_no_offload.json
10 |   deepspeed_offload: false
11 | environment:
12 |   environment_variables:
13 |     - NCCL_DEBUG=INFO
14 |     # You may need to modify this to match your network configuration.
15 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
16 |   image:
17 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
18 | bind_mounts:
19 |   - host_path: /tmp
20 |     container_path: /data
21 |   - host_path: /tmp
22 |     container_path: /root/.cache
23 | resources:
24 |   slots_per_trial: 2
25 | records_per_epoch: 5000
26 | searcher:
27 |   name: single
28 |   metric: accuracy
29 |   smaller_is_better: false
30 |   max_length:
31 |     epochs: 1
32 | entrypoint:
33 |   - python3
34 |   - -m
35 |   - determined.launch.deepspeed
36 |   - --trial
37 |   - model_def:CIFARTrial
38 | checkpoint_policy: none
39 | max_restarts: 0
40 | scheduling_unit: 2000
41 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: pix2pix_facades_adaptive_asha
 2 | data:
 3 |   base: http://efrosgans.eecs.berkeley.edu/pix2pix/datasets
 4 |   dataset: facades
 5 |   BUFFER_SIZE: 400
 6 |   height: 256
 7 |   width: 256
 8 | hyperparameters:
 9 |   global_batch_size: 1
10 |   discriminator_lr:
11 |     type: log
12 |     base: 10
13 |     minval: -5
14 |     maxval: -4
15 |   discriminator_beta_1:
16 |     type: log
17 |     base: 10
18 |     minval: -1
19 |     maxval: 0
20 |   generator_lr: 
21 |     type: log
22 |     base: 10
23 |     minval: -5
24 |     maxval: -4
25 |   generator_beta_1: 
26 |     type: log
27 |     base: 10
28 |     minval: -1
29 |     maxval: 0
30 |   jitter:
31 |     type: int
32 |     minval: 0
33 |     maxval: 30
34 |   mirror: true
35 | records_per_epoch: 400  # There are 400 images in the facades training set
36 | min_validation_period:
37 |   batches: 40
38 | min_checkpoint_period:
39 |   batches: 400
40 | searcher:
41 |   name: adaptive_asha
42 |   metric: val_total_loss
43 |   smaller_is_better: true
44 |   max_length:
45 |     batches: 4000
46 |   max_trials: 50
47 | entrypoint: model_def:Pix2PixTrial
48 | 


--------------------------------------------------------------------------------
/blog/tp/test_dot_product_local.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Demonstrates the equivalence of a basic dot product with an intermediate activation function
 3 | and a sharded version of the same calculation.
 4 | """
 5 | 
 6 | import torch
 7 | 
 8 | D_MODEL = 128
 9 | RANKS = 4
10 | 
if __name__ == "__main__":
    # Check that a dot product with an element-wise activation on one operand
    # gives the same value whether it is computed on the flat vectors or on the
    # same vectors reshaped into RANKS equal shards.
    a = torch.randn(D_MODEL)
    b = torch.randn(D_MODEL)
    act_fn = torch.nn.GELU()

    # Reference value that every other formulation is compared against.
    reference = a @ act_fn(b)

    shard_shape = (RANKS, D_MODEL // RANKS)
    a_sharded = a.reshape(shard_shape)
    b_sharded = b.reshape(shard_shape)

    alternatives = [
        # Flat-vector formulations.
        (a * act_fn(b)).sum(),
        torch.einsum("i, i", a, act_fn(b)),
        # Sharded formulations: summing (or tracing) over shards recovers the
        # full dot product.
        (a_sharded * act_fn(b_sharded)).sum(),
        (a_sharded @ act_fn(b_sharded).T).trace(),
        (a_sharded.T @ act_fn(b_sharded)).trace(),
        torch.einsum("ij, ij", a_sharded, act_fn(b_sharded)),
    ]

    for candidate in alternatives:
        torch.testing.assert_close(reference, candidate)
    print("Correct results")
33 | 


--------------------------------------------------------------------------------
/computer_vision/byol_pytorch/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Callable, Dict, TypeVar
 2 | 
 3 | import torch.nn as nn
 4 | 
 5 | A = TypeVar("A")
 6 | B = TypeVar("B")
 7 | 
 8 | 
 9 | def merge_dicts(d1: Dict[A, B], d2: Dict[A, B], f: Callable[[B, B], B]) -> Dict[A, B]:
10 |     """
11 |     Merges dictionaries with a custom merge function.
12 |     E.g. if k in d1 and k in d2, result[k] == f(d1[k], d2[k]).
13 |     Otherwise, if e.g. k is in only d1, result[k] == d1[k]
14 |     """
15 |     d1_keys = d1.keys()
16 |     d2_keys = d2.keys()
17 |     shared = d1_keys & d2_keys
18 |     d1_exclusive = d1_keys - d2_keys
19 |     d2_exclusive = d2_keys - d1_keys
20 |     new_dict = {k: f(d1[k], d2[k]) for k in shared}
21 |     new_dict.update({k: d1[k] for k in d1_exclusive})
22 |     new_dict.update({k: d2[k] for k in d2_exclusive})
23 |     return new_dict
24 | 
25 | 
class LambdaModule(nn.Module):
    """
    Adapter that exposes a plain callable as an nn.Module.

    `forward(x)` returns the result of calling the wrapped callable on `x`.
    """

    def __init__(self, lam: Callable) -> None:
        super(LambdaModule, self).__init__()
        # Stored callable; invoked on every forward pass.
        self.lam = lam

    def forward(self, x: Any) -> Any:
        wrapped = self.lam
        return wrapped(x)
37 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_tf_keras/adaptive.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_tf_keras_adaptive_search
 2 | data:
 3 |   url: https://s3-us-west-2.amazonaws.com/determined-ai-datasets/cifar10/cifar-10-python.tar.gz
 4 | hyperparameters:
 5 |   learning_rate:
 6 |     type: log
 7 |     minval: -5.0
 8 |     maxval: 1.0
 9 |     base: 10.0
10 |   learning_rate_decay: 1.0e-6
11 |   layer1_dropout:
12 |     type: double
13 |     minval: 0.2
14 |     maxval: 0.5
15 |   layer2_dropout:
16 |     type: double
17 |     minval: 0.2
18 |     maxval: 0.5
19 |   layer3_dropout:
20 |     type: double
21 |     minval: 0.2
22 |     maxval: 0.5
23 |   global_batch_size:
24 |     type: int
25 |     minval: 16
26 |     maxval: 64
27 |   width_shift_range:
28 |     type: double
29 |     minval: 0.0
30 |     maxval: 0.2
31 |   height_shift_range:
32 |     type: double
33 |     minval: 0.0
34 |     maxval: 0.2
35 |   horizontal_flip:
36 |     type: categorical
37 |     vals:
38 |       - True
39 |       - False
40 | records_per_epoch: 50000
41 | searcher:
42 |   name: adaptive_asha
43 |   mode: aggressive
44 |   metric: val_categorical_error
45 |   max_length:
46 |     epochs: 32
47 |   max_trials: 16
48 | entrypoint: model_def:CIFARTrial
49 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_moe/ds_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_batch_size": 16,
 3 |   "steps_per_print": 2000,
 4 |   "optimizer": {
 5 |     "type": "Adam",
 6 |     "params": {
 7 |       "lr": 0.001,
 8 |       "betas": [
 9 |         0.8,
10 |         0.999
11 |       ],
12 |       "eps": 1e-8,
13 |       "weight_decay": 3e-7
14 |     }
15 |   },
16 |   "scheduler": {
17 |     "type": "WarmupLR",
18 |     "params": {
19 |       "warmup_min_lr": 0,
20 |       "warmup_max_lr": 0.001,
21 |       "warmup_num_steps": 1000
22 |     }
23 |   },
24 |   "gradient_clipping": 1.0,
25 |   "prescale_gradients": false,
26 |   "fp16": {
27 |       "enabled": true,
28 |       "fp16_master_weights_and_grads": false,
29 |       "loss_scale": 0,
30 |       "loss_scale_window": 500,
31 |       "hysteresis": 2,
32 |       "min_loss_scale": 1,
33 |       "initial_scale_power": 15
34 |   },
35 |   "wall_clock_breakdown": false,
36 |   "zero_optimization": {
37 |       "stage": 0,
38 |       "allgather_partitions": true,
39 |       "reduce_scatter": true,
40 |       "allgather_bucket_size": 50000000,
41 |       "reduce_bucket_size": 50000000,
42 |       "overlap_comm": true,
43 |       "contiguous_gradients": true,
44 |       "cpu_offload": false
45 |   }
46 | }
47 | 


--------------------------------------------------------------------------------
/computer_vision/byol_pytorch/evaluate_result.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | from determined.experimental import client
 4 | 
 5 | if __name__ == "__main__":
 6 |     parser = ArgumentParser(
 7 |         description="Start an evaluation run (w/ classifier training) from the top checkpoint of a given experiment."
 8 |     )
 9 |     parser.add_argument("--experiment-id", type=int, required=True)
10 |     parser.add_argument("--classifier-train-epochs", type=int, default=80)
11 |     args = parser.parse_args()
12 |     exp = client.get_experiment(args.experiment_id)
13 |     config = dict(exp.get_config())
14 |     print(sorted(list(config.keys())))
15 |     config["name"] = config["name"] + "_evaluation"
16 |     config["min_validation_period"] = {"epochs": args.classifier_train_epochs}
17 |     config["searcher"]["max_length"]["epochs"] = args.classifier_train_epochs
18 |     config["hyperparameters"]["training_mode"] = "CLASSIFIER_ONLY"
19 |     config["hyperparameters"]["validate_with_classifier"] = True
20 |     config["searcher"]["source_checkpoint_uuid"] = exp.top_checkpoint().uuid
21 |     config["searcher"]["metric"] = "test_accuracy"
22 |     config["searcher"]["smaller_is_better"] = False
23 |     client.create_experiment(config, ".")
24 | 


--------------------------------------------------------------------------------
/gan/cyclegan/startup-hook.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Startup hook: install unzip, then download one CycleGAN dataset and unpack it under $TMP_DIR.
 3 | apt install -y unzip  # -y keeps the install non-interactive
 4 | 
 5 | FILE=monet2photo
 6 | TMP_DIR=/tmp
 7 | 
 8 | if [[ $FILE != "ae_photos" && $FILE != "apple2orange" && $FILE != "summer2winter_yosemite" && $FILE != "horse2zebra" && $FILE != "monet2photo" && $FILE != "cezanne2photo" && $FILE != "ukiyoe2photo" && $FILE != "vangogh2photo" && $FILE != "maps" && $FILE != "cityscapes" && $FILE != "facades" && $FILE != "iphone2dslr_flower" ]]; then
 9 |     echo "Available datasets are: apple2orange, summer2winter_yosemite, horse2zebra, monet2photo, cezanne2photo, ukiyoe2photo, vangogh2photo, maps, cityscapes, facades, iphone2dslr_flower, ae_photos"
10 |     exit 1
11 | fi
12 | 
13 | URL=https://people.eecs.berkeley.edu/~taesung_park/CycleGAN/datasets/$FILE.zip
14 | ZIP_FILE=$TMP_DIR/$FILE.zip
15 | TARGET_DIR=$TMP_DIR/$FILE
16 | wget --no-verbose "$URL" -O "$ZIP_FILE"  # -N dropped: wget ignores timestamping when -O is given
17 | unzip -q "$ZIP_FILE" -d "$TMP_DIR"
18 | rm "$ZIP_FILE"
19 | 
20 | # Adapt to the directory hierarchy the project expects
21 | mkdir -p "$TARGET_DIR/train" "$TARGET_DIR/test"
22 | mv "$TARGET_DIR/trainA" "$TARGET_DIR/train/A"
23 | mv "$TARGET_DIR/trainB" "$TARGET_DIR/train/B"
24 | mv "$TARGET_DIR/testA" "$TARGET_DIR/test/A"
25 | mv "$TARGET_DIR/testB" "$TARGET_DIR/test/B"
26 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_moe/moe.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_moe_deepspeed
 2 | debug: false
 3 | hyperparameters:
 4 |   deepspeed_config: ds_config.json
 5 |   moe: true
 6 |   num_experts:
 7 |     - 2
 8 |   ep_world_size: 2
 9 |   mlp_type: standard
10 |   top_k: 1
11 |   min_capacity: 0
12 |   noisy_gate_policy: RSample
13 |   moe_param_group: true
14 | 
15 | environment:
16 |   environment_variables:
17 |     - NCCL_DEBUG=INFO
18 |     # You may need to modify this to match your network configuration.
19 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
20 |   #    - CUDA_LAUNCH_BLOCKING=1
21 |   #    - NCCL_BLOCKING_WAIT=1
22 |   #    - NCCL_IB_DISABLE=1
23 |   image:
24 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
25 | bind_mounts:
26 |   - host_path: /tmp
27 |     container_path: /data
28 |   - host_path: /tmp
29 |     container_path: /root/.cache
30 | resources:
31 |   slots_per_trial: 2
32 | records_per_epoch: 50000
33 | searcher:
34 |   name: single
35 |   metric: accuracy
36 |   smaller_is_better: false
37 |   max_length:
38 |     epochs: 2
39 | entrypoint:
40 |   - python3
41 |   - -m
42 |   - determined.launch.deepspeed
43 |   - --trial
44 |   - model_def:CIFARTrial
45 | checkpoint_policy: none
46 | max_restarts: 0
47 | scheduling_unit: 2000
48 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning/chat_format.py:
--------------------------------------------------------------------------------
 1 | CHAT_ML_TEMPLATE = """
 2 | {% for message in messages %}
 3 | {% if message['role'] == 'user' %}
 4 | {{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }}
 5 | {% elif message['role'] == 'system' %}
 6 | {{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }}
 7 | {% elif message['role'] == 'assistant' %}
 8 | {{'<|im_start|>assistant\n'  + message['content'] + '<|im_end|>' }}
 9 | {% endif %}
10 | {% endfor %}
11 | """  # Jinja-style chat template; user/system content is stripped, assistant content is kept as-is
12 | # Text that opens an assistant turn (the same prefix the template emits for assistant messages).
13 | ASSISTANT_PROMPT = "<|im_start|>assistant\n"
14 | # Marker the template appends after every message.
15 | EOS_TOKEN = "<|im_end|>"
16 | 
17 | 
18 | def get_chat_format(element):
19 |     system_prompt = (
20 |         "You are a helpful programmer assistant that excels at SQL. "
21 |         "When prompted with a task and a definition of an SQL table, you "
22 |         "respond with a SQL query to retrieve information from the table. "
23 |         "Don't explain your reasoning, only provide the SQL query."
24 |     )
25 |     user_prompt = "Task: {instruction}\nSQL table: {input}\nSQL query: "
26 | 
27 |     return [
28 |         {"role": "system", "content": system_prompt},
29 |         {"role": "user", "content": user_prompt.format_map(element)},
30 |         {"role": "assistant", "content": element["response"]},
31 |     ]
32 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/mmdet_experiment.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import hydra
 4 | from omegaconf import DictConfig, MissingMandatoryValue, OmegaConf
 5 | 
 6 | from determined.common.experimental import Determined
 7 | 
 8 | CONTEXT_DIR = os.getcwd()  # working directory at import time; later passed to create_experiment
 9 | 
10 | 
11 | def check_for_missing(cfg):
12 |     if isinstance(cfg, dict):
13 |         for k, item in cfg.items():
14 |             if item == "???":
15 |                 raise MissingMandatoryValue(f"Missing mandatory value for {k}.")
16 |             check_for_missing(item)
17 |     elif isinstance(cfg, list):
18 |         for item in cfg:
19 |             check_for_missing(item)
20 | 
21 | 
22 | @hydra.main(config_path="./configs", config_name="config")
23 | def my_experiment(cfg: DictConfig) -> None:
24 |     config = OmegaConf.to_container(cfg, resolve=True)
25 |     # We use a helper function now to check for missing values.
26 |     # In the next version of omegaconf, we will be able to check for missing values by
27 |     # passing throw_on_missing to the OmegaConf.to_container call above.
28 |     check_for_missing(config)
29 | 
30 |     master = Determined()
31 |     exp = master.create_experiment(config, CONTEXT_DIR)
32 |     exp.activate()
33 | 
34 | 
35 | if __name__ == "__main__":
36 |     my_experiment()  # called without arguments; cfg is expected to come from the @hydra.main decorator
37 | 


--------------------------------------------------------------------------------
/features/custom_reducers_mnist_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch Custom Reducers (MNIST)
 2 | This tutorial shows how to use custom reducers with PyTorch.  In this example,
 3 | the custom reducer is a per-class F1 score.
 4 | 
 5 | This example is based on Determined's `mnist_pytorch` tutorial, with the custom
 6 | reducer as the only modification.
 7 | 
 8 | ## Files
 9 | * **model_def.py**: Where the custom reducer is defined and used.
10 | * All other files are identical to the `mnist_pytorch` tutorial code.
11 | 
12 | ## To Run
13 | If you have not yet installed Determined, installation instructions can be found
14 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
15 | 
16 | Run the following command: `det -m <master host:port> experiment create -f
17 | const.yaml .`. The other configurations can be run by specifying the appropriate
18 | configuration file in place of `const.yaml`.
19 | 
20 | ## Results
21 | You should see the per-class F1 scores in the Determined WebUI and while
22 | viewing the tensorboard results for the experiment.  The remaining metrics
23 | should match the behavior of the `mnist_pytorch` tutorial.
24 | 
25 | The custom reducers should work whether you run a single-slot experiment or a
26 | multi-slot experiment with distributed training.
27 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/ds_configs/ds_config_stage_1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "scheduler": {
20 |     "type": "WarmupLR",
21 |     "params": {
22 |       "warmup_min_lr": "auto",
23 |       "warmup_max_lr": "auto",
24 |       "warmup_num_steps": "auto"
25 |     }
26 |   },
27 |   "zero_optimization": {
28 |     "stage": 1,
29 |     "allgather_partitions": true,
30 |     "allgather_bucket_size": 2e8,
31 |     "overlap_comm": true,
32 |     "reduce_scatter": true,
33 |     "reduce_bucket_size": 2e8,
34 |     "contiguous_gradients": true
35 |   },
36 |   "gradient_accumulation_steps": "auto",
37 |   "gradient_clipping": "auto",
38 |   "train_batch_size": "auto",
39 |   "train_micro_batch_size_per_gpu": "auto",
40 |   "flops_profiler": {
41 |     "enabled": true,
42 |     "profile_step": 1,
43 |     "module_depth": -1,
44 |     "top_modules": 1,
45 |     "detailed": true,
46 |     "output_file": null
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/ds_configs/ds_config_stage_2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "scheduler": {
20 |     "type": "WarmupLR",
21 |     "params": {
22 |       "warmup_min_lr": "auto",
23 |       "warmup_max_lr": "auto",
24 |       "warmup_num_steps": "auto"
25 |     }
26 |   },
27 |   "zero_optimization": {
28 |     "stage": 2,
29 |     "allgather_partitions": true,
30 |     "allgather_bucket_size": 2e8,
31 |     "overlap_comm": true,
32 |     "reduce_scatter": true,
33 |     "reduce_bucket_size": 2e8,
34 |     "contiguous_gradients": true
35 |   },
36 |   "gradient_accumulation_steps": "auto",
37 |   "gradient_clipping": "auto",
38 |   "train_batch_size": "auto",
39 |   "train_micro_batch_size_per_gpu": "auto",
40 |   "flops_profiler": {
41 |     "enabled": true,
42 |     "profile_step": 1,
43 |     "module_depth": -1,
44 |     "top_modules": 1,
45 |     "detailed": true,
46 |     "output_file": null
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/ds_configs/ds_config_stage_1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "scheduler": {
20 |     "type": "WarmupLR",
21 |     "params": {
22 |       "warmup_min_lr": "auto",
23 |       "warmup_max_lr": "auto",
24 |       "warmup_num_steps": "auto"
25 |     }
26 |   },
27 |   "zero_optimization": {
28 |     "stage": 1,
29 |     "allgather_partitions": true,
30 |     "allgather_bucket_size": 2e8,
31 |     "overlap_comm": true,
32 |     "reduce_scatter": true,
33 |     "reduce_bucket_size": 2e8,
34 |     "contiguous_gradients": true
35 |   },
36 |   "gradient_accumulation_steps": "auto",
37 |   "gradient_clipping": "auto",
38 |   "train_batch_size": "auto",
39 |   "train_micro_batch_size_per_gpu": "auto",
40 |   "flops_profiler": {
41 |     "enabled": true,
42 |     "profile_step": 1,
43 |     "module_depth": -1,
44 |     "top_modules": 1,
45 |     "detailed": true,
46 |     "output_file": null
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/blog/lora-parameters/README.md:
--------------------------------------------------------------------------------
 1 | # Finding the best LoRA parameters
 2 | 
 3 | We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 40 GB A100 GPUs utilizing DeepSpeed.  
 4 | 
 5 | See our [blog post](https://www.determined.ai/blog/lora-parameters) for our experiment results.
 6 | 
 7 | To get started, first install Determined on your local machine:
 8 | ```bash
 9 | pip install determined
10 | ```
11 | 
12 | Then finetune with LoRA:
13 | ```bash
14 | det e create lora.yaml . 
15 | ```
16 | 
17 | You can view the actual training code in `finetune.py`.
18 | 
19 | 
20 | ## Configuration
21 | 
22 | Change configuration options in `lora.yaml`. Some important options are:
23 | - `slots_per_trial`: the number of GPUs to use.
24 | - `dataset_subset`: the difficulty subset to train on.
25 | - `per_device_train_batch_size`: the batch size per GPU.
26 | 
27 | 
28 | DeepSpeed configuration files are in the `ds_configs` folder.
29 | 
30 | 
31 | ## Contributors
32 | 
33 | - By [Sze Wai Yuen](https://github.com/szewaiyuen6)
34 | - Built on `llm-finetuning` code by [Agnieszka Ciborowska](https://github.com/aciborowska) and [Kevin Musgrave](https://github.com/KevinMusgrave).


--------------------------------------------------------------------------------
/nlp/albert_squad_pytorch/distributed_8gpu.yaml:
--------------------------------------------------------------------------------
 1 | # After 2 epochs, model should hit 85.76/88.87 F1/EM
 2 | name: ALBert_SQuAD_PyTorch_8gpu
 3 | hyperparameters:
 4 |     global_batch_size: 16
 5 |     learning_rate: 5.0e-5
 6 |     model_type: 'albert'
 7 |     do_lower_case: true
 8 |     adam_epsilon: 1.0e-8
 9 |     weight_decay: 0
10 |     num_warmup_steps: 1620
11 |     max_seq_length: 384
12 |     doc_stride: 128
13 |     max_query_length: 64
14 |     n_best_size: 20
15 |     max_answer_length: 30
16 |     null_score_diff_threshold: 0.0
17 |     max_grad_norm: 1.0
18 |     num_training_steps: 16500 # This is the number of optimizer steps. Train for 2 epochs
19 |     use_radam: false
20 | resources:
21 |     slots_per_trial: 8
22 | searcher:
23 |     name: single
24 |     metric: f1
25 |     max_length:
26 |         records: 264396
27 |     smaller_is_better: false
28 | min_validation_period:
29 |     records: 80000
30 | data:
31 |     pretrained_model_name: "albert-xxlarge-v2"
32 |     use_bind_mount: True
33 |     bind_mount_path: /mnt/data
34 |     task: "SQuAD2.0"  # SQuAD 2.0 has 132198 examples.
35 | entrypoint: model_def:AlbertSQuADPyTorch
36 | optimizations:
37 |     aggregation_frequency: 3
38 | bind_mounts:
39 |     - host_path: /tmp/
40 |       container_path: /mnt/data
41 |       read_only: false
42 | 


--------------------------------------------------------------------------------
/nlp/albert_squad_pytorch/distributed_64gpu.yaml:
--------------------------------------------------------------------------------
 1 | # After 2 epochs, model should hit 86.24/89.06 F1/EM
 2 | name: ALBert_SQuAD_PyTorch_64gpu
 3 | hyperparameters:
 4 |     global_batch_size: 128
 5 |     learning_rate: 0.0002
 6 |     model_type: 'albert'
 7 |     do_lower_case: true
 8 |     adam_epsilon: 1.0e-8
 9 |     weight_decay: 0
10 |     num_warmup_steps: 206
11 |     max_seq_length: 384
12 |     doc_stride: 128
13 |     max_query_length: 64
14 |     n_best_size: 20
15 |     max_answer_length: 30
16 |     null_score_diff_threshold: 0.0
17 |     max_grad_norm: 1.0
18 |     num_training_steps: 2064 # This is the number of optimizer steps. Train for 2 epochs
19 |     use_radam: true
20 | resources:
21 |     slots_per_trial: 64
22 | searcher:
23 |     name: single
24 |     metric: f1
25 |     max_length:
26 |         records: 264396
27 |     smaller_is_better: false
28 | min_validation_period:
29 |     records: 100000
30 | data:
31 |     pretrained_model_name: "albert-xxlarge-v2"
32 |     use_bind_mount: True
33 |     bind_mount_path: /mnt/data
34 |     task: "SQuAD2.0"  # SQuAD 2.0 has 132198 examples.
35 | entrypoint: model_def:AlbertSQuADPyTorch
36 | optimizations:
37 |     aggregation_frequency: 2
38 | bind_mounts:
39 |     - host_path: /tmp/
40 |       container_path: /mnt/data
41 |       read_only: false
42 | 


--------------------------------------------------------------------------------
/meta_learning/protonet_omniglot_pytorch/20way1shot.yaml:
--------------------------------------------------------------------------------
 1 | name: omniglot_protonet 
 2 | data:
 3 |     data_path: ./data
 4 |     validation_portion: 0.25
 5 |     tasks_per_epoch_train: 100
 6 |     tasks_per_epoch_val: 1000
 7 |     train_workers: 8
 8 |     val_workers: 4
 9 | 
10 | hyperparameters:
11 |   learning_rate: 1.0e-3
12 |   weight_decay: 0
13 |   reduce_every: 200
14 |   lr_gamma: 0.5
15 |   global_batch_size: 2 # how many tasks to train before performing a meta-update
16 |   val_batch_size: 2 # how many tasks to evaluate on
17 |   # Meta-training
18 |   num_classes_train: 60
19 |   num_support_train: 1
20 |   num_query_train: 5
21 |   # Meta-test
22 |   num_classes_val: 20 #n-way
23 |   num_support_val: 1 #k-shot
24 |   # Model
25 |   img_resize_dim: 28 # input will be 1 x img_resize_dim x img_resize_dim
26 |   hidden_dim: 64 # intermediate number of channels
27 |   embedding_dim: 64 # embedding number of channels
28 | 
29 | resources:
30 |   slots_per_trial: 2
31 | 
32 | searcher:
33 |   name: single
34 |   metric: loss
35 |   smaller_is_better: true
36 |   # Original paper trained for 10,000 epochs with a plateau stopping condition
37 |   max_length:
38 |     batches: 30000
39 | 
40 | entrypoint: model_def:OmniglotProtoNetTrial
41 | min_validation_period:
42 |   batches: 5000
43 | checkpoint_policy: none
44 | 


--------------------------------------------------------------------------------
/meta_learning/protonet_omniglot_pytorch/20way5shot.yaml:
--------------------------------------------------------------------------------
 1 | name: omniglot_protonet 
 2 | data:
 3 |     data_path: ./data
 4 |     validation_portion: 0.25
 5 |     tasks_per_epoch_train: 100
 6 |     tasks_per_epoch_val: 1000
 7 |     train_workers: 8
 8 |     val_workers: 4
 9 | 
10 | hyperparameters:
11 |   learning_rate: 1.0e-3
12 |   weight_decay: 0
13 |   reduce_every: 200
14 |   lr_gamma: 0.5
15 |   global_batch_size: 2 # how many tasks to train before performing a meta-update
16 |   val_batch_size: 2 # how many tasks to evaluate on
17 |   # Meta-training
18 |   num_classes_train: 60
19 |   num_support_train: 5
20 |   num_query_train: 5
21 |   # Meta-test
22 |   num_classes_val: 20 #n-way
23 |   num_support_val: 5 #k-shot
24 |   # Model
25 |   img_resize_dim: 28 # input will be 1 x img_resize_dim x img_resize_dim
26 |   hidden_dim: 64 # intermediate number of channels
27 |   embedding_dim: 64 # embedding number of channels
28 | 
29 | resources:
30 |   slots_per_trial: 2
31 | 
32 | searcher:
33 |   name: single
34 |   metric: loss
35 |   smaller_is_better: true
36 |   # Original paper trained for 10,000 epochs with a plateau stopping condition
37 |   max_length:
38 |     batches: 30000
39 | 
40 | entrypoint: model_def:OmniglotProtoNetTrial
41 | min_validation_period:
42 |   batches: 5000
43 | checkpoint_policy: none
44 | 


--------------------------------------------------------------------------------
/nlp/albert_squad_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | # After 2 epochs, model should hit 85.76/88.87 F1/EM
 2 | name: ALBert_SQuAD_PyTorch_1gpu
 3 | hyperparameters:
 4 |     global_batch_size: 2
 5 |     learning_rate: 5.0e-5
 6 |     model_type: 'albert'
 7 |     adam_epsilon: 1.0e-8
 8 |     weight_decay: 0
 9 |     num_warmup_steps: 13220  # 10% of total training
10 |     max_seq_length: 384
11 |     doc_stride: 128
12 |     max_query_length: 64
13 |     n_best_size: 20
14 |     max_answer_length: 30
15 |     null_score_diff_threshold: 0.0
16 |     max_grad_norm: 1.0
17 |     num_training_steps: 132198 # This is the number of optimizer steps. Train for 2 epochs
18 |     do_lower_case: true
19 |     use_radam: false
20 | resources:
21 |     slots_per_trial: 1
22 | searcher:
23 |     name: single
24 |     metric: f1
25 |     max_length:
26 |         records: 264396
27 |     smaller_is_better: false
28 | min_validation_period:
29 |     records: 80000
30 | data:
31 |     pretrained_model_name: "albert-xxlarge-v2"
32 |     use_bind_mount: True
33 |     bind_mount_path: /mnt/data
34 |     task: "SQuAD2.0"  # SQuAD 2.0 has 132198 examples.
35 | entrypoint: model_def:AlbertSQuADPyTorch
36 | optimizations:
37 |     aggregation_frequency: 24
38 | bind_mounts:
39 |     - host_path: /tmp/
40 |       container_path: /mnt/data
41 |       read_only: false
42 | 


--------------------------------------------------------------------------------
/deepspeed/pipeline_parallelism/README.md:
--------------------------------------------------------------------------------
 1 | # DeepSpeed CIFAR Example
 2 | This example is adapted from the 
 3 | [pipeline parallelism example in the DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/pipeline_parallelism) 
 4 | repository. It is intended to demonstrate a simple use case of DeepSpeed's PipelineEngine with Determined.
 5 | 
 6 | ## Files
 7 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 8 | * **alexnet.py**: Specifies the AlexNet architecture.
 9 | 
10 | ### Configuration Files
11 | * **ds_config.json**: The DeepSpeed config file.
12 | * **distributed.yaml**: Determined config to train the model with 2-stage pipeline parallelism.
13 | 
14 | ## Data
15 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html.
16 | 
17 | ## To Run
18 | If you have not yet installed Determined, installation instructions can be found
19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
20 | 
21 | Run the following command: 
22 | ```
23 | det experiment create distributed.yaml .
24 | ```
25 | 
26 | ## Results
27 | Training the model with the hyperparameter settings in `distributed.yaml` on 2 
28 | NVIDIA Tesla V100s on a single node should yield a throughput of at least 800 samples/sec.
29 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_moe/zero_stages.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_zero_deepspeed
 2 | debug: false
 3 | hyperparameters:
 4 |   deepspeed_config: ds_config.json
 5 |   moe: false
 6 |   num_experts:
 7 |     - 2
 8 |   ep_world_size: 2
 9 |   mlp_type: standard
10 |   top_k: 1
11 |   min_capacity: 0
12 |   noisy_gate_policy: RSample
13 |   moe_param_group: true
14 |   overwrite_deepspeed_args:
15 |     zero_optimization.stage: 2
16 | environment:
17 |   environment_variables:
18 |     - NCCL_DEBUG=INFO
19 |     # You may need to modify this to match your network configuration.
20 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
21 |   #    - CUDA_LAUNCH_BLOCKING=1
22 |   #    - NCCL_BLOCKING_WAIT=1
23 |   #    - NCCL_IB_DISABLE=1
24 |   image:
25 |     gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-0.22.1
26 | bind_mounts:
27 |   - host_path: /tmp
28 |     container_path: /data
29 |   - host_path: /tmp
30 |     container_path: /root/.cache
31 | resources:
32 |   slots_per_trial: 2
33 | records_per_epoch: 50000
34 | searcher:
35 |   name: single
36 |   metric: accuracy
37 |   smaller_is_better: false
38 |   max_length:
39 |     epochs: 2
40 | entrypoint:
41 |   - python3
42 |   - -m
43 |   - determined.launch.deepspeed
44 |   - --trial
45 |   - model_def:CIFARTrial
46 | checkpoint_policy: none
47 | max_restarts: 0
48 | scheduling_unit: 2000
49 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/ds_configs/ds_config_stage_3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "optimizer": {
14 |     "type": "AdamW",
15 |     "params": {
16 |       "lr": "auto",
17 |       "betas": "auto",
18 |       "eps": "auto",
19 |       "weight_decay": "auto"
20 |     }
21 |   },
22 |   "scheduler": {
23 |     "type": "WarmupDecayLR",
24 |     "params": {
25 |       "warmup_min_lr": "auto",
26 |       "warmup_max_lr": "auto",
27 |       "warmup_num_steps": "auto",
28 |       "total_num_steps": "auto"
29 |     }
30 |   },
31 |   "zero_optimization": {
32 |     "stage": 3,
33 |     "overlap_comm": true,
34 |     "contiguous_gradients": true,
35 |     "sub_group_size": 1e9,
36 |     "reduce_bucket_size": "auto",
37 |     "stage3_prefetch_bucket_size": "auto",
38 |     "stage3_param_persistence_threshold": "auto",
39 |     "stage3_max_live_parameters": 1e9,
40 |     "stage3_max_reuse_distance": 1e9,
41 |     "stage3_gather_16bit_weights_on_model_save": true
42 |   },
43 |   "gradient_accumulation_steps": "auto",
44 |   "gradient_clipping": "auto",
45 |   "train_batch_size": "auto",
46 |   "train_micro_batch_size_per_gpu": "auto"
47 | }
48 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_moe/README.md:
--------------------------------------------------------------------------------
 1 | # DeepSpeed CIFAR Example
 2 | This example is adapted from the 
 3 | [CIFAR example in the DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples/tree/master/training/cifar) 
 4 | repository. It is intended to demonstrate a simple use case of DeepSpeed with Determined.
 5 | 
 6 | ## Files
 7 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 8 | 
 9 | ### Configuration Files
10 | * **ds_config.json**: The DeepSpeed config file.
11 | * **moe.yaml**: Determined config to train the model with Mixture of Experts enabled.
12 | * **zero_stages.yaml**: Same as `moe.yaml`, but with Mixture of Experts disabled and the ZeRO stage 2 optimizer enabled.
13 | 
14 | ## Data
15 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html.
16 | 
17 | ## To Run
18 | If you have not yet installed Determined, installation instructions can be found
19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
20 | 
21 | Run the following command: 
22 | ```
23 | det experiment create moe.yaml .
24 | ``` 
25 | The other configuration can be run by specifying the appropriate configuration file in place 
26 | of `moe.yaml`.
27 | 
28 | ## Results
29 | Training the model with the hyperparameter settings in `moe.yaml` should yield
30 | a validation accuracy of ~45% after 2 epochs.
31 | 


--------------------------------------------------------------------------------
/gan/pix2pix_tf_keras/pix2pix/sampling.py:
--------------------------------------------------------------------------------
 1 | import tensorflow as tf
 2 | 
 3 | 
 4 | def downsample(filters, size, apply_batchnorm=True):
 5 |     initializer = tf.random_normal_initializer(0.0, 0.02)
 6 | 
 7 |     result = tf.keras.Sequential()
 8 |     result.add(
 9 |         tf.keras.layers.Conv2D(
10 |             filters,
11 |             size,
12 |             strides=2,
13 |             padding="same",
14 |             kernel_initializer=initializer,
15 |             use_bias=False,
16 |         )
17 |     )
18 | 
19 |     if apply_batchnorm:
20 |         result.add(tf.keras.layers.BatchNormalization())
21 | 
22 |     result.add(tf.keras.layers.LeakyReLU())
23 | 
24 |     return result
25 | 
26 | 
27 | def upsample(filters, size, apply_dropout=False):
28 |     initializer = tf.random_normal_initializer(0.0, 0.02)
29 | 
30 |     result = tf.keras.Sequential()
31 |     result.add(
32 |         tf.keras.layers.Conv2DTranspose(
33 |             filters,
34 |             size,
35 |             strides=2,
36 |             padding="same",
37 |             kernel_initializer=initializer,
38 |             use_bias=False,
39 |         )
40 |     )
41 | 
42 |     result.add(tf.keras.layers.BatchNormalization())
43 | 
44 |     if apply_dropout:
45 |         result.add(tf.keras.layers.Dropout(0.5))
46 | 
47 |     result.add(tf.keras.layers.ReLU())
48 | 
49 |     return result
50 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/local_search_runner/README.md:
--------------------------------------------------------------------------------
 1 | # Custom SearchMethod with LocalSearchRunner
 2 | 
 3 | In this example, we use LocalSearchRunner, which executes a custom SearchMethod on your local machine and
 4 | orchestrates a multi-trial experiment on a Determined cluster.
 5 | 
 6 | For an example of running the custom SearchMethod on a cluster,
 7 | see `examples/custom_search_method/asha_search_method/remote_search_runner`.
 8 | 
 9 | ## Files
10 | * **run_experiment.py**: The code for running the custom SearchMethod locally with LocalSearchRunner.
11 | 
12 | ## To Run
13 | If you have not yet installed Determined, installation instructions can be found
14 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
15 | 
16 | 1. Set the `DET_MASTER` environment variable, which is the network address of the Determined master.
17 | For instance, `export DET_MASTER=<master_host:port>`.
18 | 2. Run the following command in the `asha_search_method` directory to start LocalSearchRunner: `python local_search_runner/run_experiment.py`.
19 | 
20 | ## Result
21 | LocalSearchRunner executes the custom SearchMethod on your local machine,
22 | while the multi-trial experiment for hyperparameter search is started on a Determined cluster.
23 | LocalSearchRunner handles the communication between the custom SearchMethod and the multi-trial experiment.


--------------------------------------------------------------------------------
/deepspeed/pipeline_parallelism/distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: cifar10_pipeline_parallel_deepspeed
 2 | debug: false
 3 | hyperparameters:
 4 |   deepspeed_config: ds_config.json
 5 |   pipe_parallel_size: 2
 6 |   part: parameters
 7 |   overwrite_deepspeed_args:
 8 |     train_micro_batch_size_per_gpu: 8
 9 | bind_mounts:
10 |   - host_path: /tmp
11 |     container_path: /data
12 |   - host_path: /tmp
13 |     container_path: /root/.cache
14 | environment:
15 |     #force_pull_image: true
16 |     environment_variables:                                                                          
17 |         - NCCL_DEBUG=INFO                                                                           
18 |         # You may need to modify this to match your network configuration.                          
19 |         - NCCL_SOCKET_IFNAME=ens,eth,ib
20 |     #    - CUDA_LAUNCH_BLOCKING=1
21 |     #    - NCCL_BLOCKING_WAIT=1
22 |     image:
23 |       gpu: determinedai/environments:cuda-11.3-pytorch-1.10-deepspeed-0.8.3-gpu-2b7e2a1
24 | resources:
25 |   slots_per_trial: 2
26 | records_per_epoch: 50000
27 | searcher:
28 |   name: single
29 |   metric: loss
30 |   smaller_is_better: false
31 |   max_length:
32 |     batches: 1000
33 | entrypoint:
34 |   - python3
35 |   - -m
36 |   - determined.launch.deepspeed
37 |   - --trial
38 |   - model_def:CIFARTrial
39 | max_restarts: 0
40 | checkpoint_policy: none
41 | 


--------------------------------------------------------------------------------
/features/torch_batch_process_core_api_comparison/model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from typing import Any, Dict, Optional, Sequence, Union
 3 | 
 4 | import torch
 5 | import torch.nn as nn
 6 | 
 7 | from determined.experimental import client
 8 | 
 9 | # Constants about the data set.
10 | IMAGE_SIZE = 32
11 | NUM_CHANNELS = 3
12 | NUM_CLASSES = 10
13 | 
14 | TorchData = Union[Dict[str, torch.Tensor], Sequence[torch.Tensor], torch.Tensor]
15 | 
16 | 
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one.

    The module expects exactly one positional argument, which must be a
    ``torch.Tensor``; keyword arguments are accepted but not used.
    """

    def forward(self, *args: TorchData, **kwargs: Any) -> torch.Tensor:
        """Return the single input tensor reshaped to ``(size(0), -1)``."""
        assert len(args) == 1
        (tensor,) = args
        assert isinstance(tensor, torch.Tensor)
        # `view` needs contiguous memory, hence the explicit `contiguous()`.
        leading_dim = tensor.size(0)
        flattened = tensor.contiguous().view(leading_dim, -1)
        return flattened
23 | 
24 | 
def build_model() -> nn.Sequential:
    """Build the convolutional classifier used by this example.

    The network is two conv/conv/max-pool/dropout stages followed by a
    two-layer fully connected head that emits ``NUM_CLASSES`` logits.

    Returns:
        The assembled ``nn.Sequential`` model (weights freshly initialised).
    """
    model = nn.Sequential(
        # 32 output channels. The original passed IMAGE_SIZE here, which only
        # worked because the image side length happens to equal 32 as well.
        nn.Conv2d(NUM_CHANNELS, 32, kernel_size=(3, 3)),
        nn.ReLU(),
        nn.Conv2d(32, 32, kernel_size=(3, 3)),
        nn.ReLU(),
        nn.MaxPool2d((2, 2)),
        nn.Dropout2d(0.25),
        nn.Conv2d(32, 64, (3, 3), padding=1),
        nn.ReLU(),
        nn.Conv2d(64, 64, (3, 3)),
        nn.ReLU(),
        nn.MaxPool2d((2, 2)),
        nn.Dropout2d(0.25),
        Flatten(),
        # 2304 = 64 channels * 6 * 6, the feature-map size for 32x32 inputs.
        nn.Linear(2304, 512),
        nn.ReLU(),
        # The activations are 2-D (batch, features) here, so plain Dropout is
        # the correct layer; Dropout2d on 2-D input is deprecated in PyTorch.
        nn.Dropout(0.5),
        nn.Linear(512, NUM_CLASSES),
    )
    return model
46 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-2/ds_configs/ds_config_stage_2_cpu_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "scheduler": {
20 |     "type": "WarmupLR",
21 |     "params": {
22 |       "warmup_min_lr": "auto",
23 |       "warmup_max_lr": "auto",
24 |       "warmup_num_steps": "auto"
25 |     }
26 |   },
27 |   "zero_optimization": {
28 |     "stage": 2,
29 |     "offload_optimizer": {
30 |       "device": "cpu",
31 |       "pin_memory": true
32 |     },
33 |     "allgather_partitions": true,
34 |     "allgather_bucket_size": 2e8,
35 |     "overlap_comm": true,
36 |     "reduce_scatter": true,
37 |     "reduce_bucket_size": 2e8,
38 |     "contiguous_gradients": true
39 |   },
40 |   "gradient_accumulation_steps": "auto",
41 |   "gradient_clipping": "auto",
42 |   "train_batch_size": "auto",
43 |   "train_micro_batch_size_per_gpu": "auto",
44 |   "flops_profiler": {
45 |     "enabled": true,
46 |     "profile_step": 1,
47 |     "module_depth": -1,
48 |     "top_modules": 1,
49 |     "detailed": true,
50 |     "output_file": null
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/ds_configs/ds_config_stage_2_cpu_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "optimizer": {
11 |     "type": "AdamW",
12 |     "params": {
13 |       "lr": "auto",
14 |       "betas": "auto",
15 |       "eps": "auto",
16 |       "weight_decay": "auto"
17 |     }
18 |   },
19 |   "scheduler": {
20 |     "type": "WarmupLR",
21 |     "params": {
22 |       "warmup_min_lr": "auto",
23 |       "warmup_max_lr": "auto",
24 |       "warmup_num_steps": "auto"
25 |     }
26 |   },
27 |   "zero_optimization": {
28 |     "stage": 2,
29 |     "offload_optimizer": {
30 |       "device": "cpu",
31 |       "pin_memory": true
32 |     },
33 |     "allgather_partitions": true,
34 |     "allgather_bucket_size": 2e8,
35 |     "overlap_comm": true,
36 |     "reduce_scatter": true,
37 |     "reduce_bucket_size": 2e8,
38 |     "contiguous_gradients": true
39 |   },
40 |   "gradient_accumulation_steps": "auto",
41 |   "gradient_clipping": "auto",
42 |   "train_batch_size": "auto",
43 |   "train_micro_batch_size_per_gpu": "auto",
44 |   "flops_profiler": {
45 |     "enabled": false,
46 |     "profile_step": 1,
47 |     "module_depth": -1,
48 |     "top_modules": 1,
49 |     "detailed": true,
50 |     "output_file": null
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning/README.md:
--------------------------------------------------------------------------------
 1 | # LLM Finetuning using HuggingFace + Determined
 2 | 
 3 | In this demo, we finetune [TinyLlama-1.1B-Chat](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.4) on a [text-to-SQL dataset](https://huggingface.co/datasets/Clinton/Text-to-sql-v1). We ran this on two 80 GB A100 GPUs.
 4 | 
 5 | To get started, first install Determined on your local machine:
 6 | ```bash
 7 | pip install determined
 8 | ```
 9 | 
10 | Then finetune:
11 | ```bash
12 | det e create distributed.yaml . 
13 | ```
14 | 
15 | Change configuration options in `distributed.yaml`. Some important options are:
16 | - `slots_per_trial`: the number of GPUs to use.
17 | - `dataset_subset`: the difficulty subset to train on.
18 | - `per_device_train_batch_size`: the batch size per GPU.
19 | 
20 | 
21 | Test your model's generation capabilities:
22 | 
23 | ```bash
24 | python test_model.py --exp_id <exp_id> --dataset_subset <dataset_subset>
25 | ```
26 | 
27 | Where 
28 | - `<exp_id>` is the id of your finetuning experiment in the Determined UI.
29 | - `<dataset_subset>` is one of "easy", "medium", or "hard".
30 | 
31 | To test the pretrained model (not finetuned), leave out `--exp_id`. For example:
32 | 
33 | ```bash
34 | python test_model.py --dataset_subset easy
35 | ```
36 | 
37 | ## Contributors
38 | 
39 | - [Kevin Musgrave](https://github.com/KevinMusgrave)
40 | - [Agnieszka Ciborowska](https://github.com/aciborowska)


--------------------------------------------------------------------------------
/computer_vision/fasterrcnn_coco_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch Faster R-CNN Example
 2 | 
 3 | This example shows how to build an object detection model on the Penn-Fudan 
 4 | Database using Determined's PyTorch API. This example is adapted from this [PyTorch 
 5 | Mask R-CNN tutorial](https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | 
10 | ### Configuration Files
11 | * **const.yaml**: Train the model with constant hyperparameter values.
12 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm.
13 | 
14 | ## Data
15 | The current implementation uses the pedestrian detection and segmentation 
16 | [Penn-Fudan Database](https://www.cis.upenn.edu/~jshi/ped_html/).
17 | 
18 | ## To Run
19 | If you have not yet installed Determined, installation instructions can be found
20 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
21 | 
22 | Run the following command: `det -m <master host:port> experiment create -f 
23 | const.yaml .`. The other configurations can be run by specifying the appropriate 
24 | configuration file in place of `const.yaml`.
25 | 
26 | ## Results
27 | Training the model with the hyperparameter settings in `const.yaml` should yield
28 | an IOU of ~0.42.
29 | 


--------------------------------------------------------------------------------
/blog/act-mem-2/mlp_script.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Prints out the ratio of activation memory for the MLP layer when using ReLU vs GELU.
 3 | """
 4 | 
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | import act_mem
 9 | import layers
10 | 
# Script entry point: build one MLP per activation function, run a forward
# pass on a shared CUDA input, and report the bytes of tensors saved for the
# backward pass as measured by the `act_mem` helpers.
if __name__ == "__main__":
    batch_size, seq_len, d_model = 2, 4096, 1024
    dtype = torch.bfloat16
    # A single input tensor is reused for every activation function.
    inputs = torch.randn(
        batch_size,
        seq_len,
        d_model,
        device="cuda",
        requires_grad=True,
        dtype=dtype,
    )

    # Insertion order matters: the final ratio indexes results as [ReLU, GELU].
    act_fn_dict = {"ReLU": nn.ReLU(), "GELU": nn.GELU()}
    # Append outputs to a list to keep tensors alive
    outputs = []
    mem_bytes = []

    for name, act_fn in act_fn_dict.items():
        mlp = layers.MLP(
            d_model=d_model,
            act_fn=act_fn,
            device="cuda",
            dtype=dtype,
        )
        # The model is built before entering the contexts, and its parameters
        # are passed as `ignored_tensors`.
        # NOTE(review): presumably this excludes parameter memory from both
        # measurements; confirm against act_mem.
        with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext(
            ignored_tensors=mlp.parameters()
        ) as saved:
            out = mlp(inputs)
            outputs.append(out)
        # Cross-check: the two measurements taken inside the block must agree.
        assert mem.delta["current"] == saved.saved_tensor_mem
        print(f"{name} bytes: {saved.saved_tensor_mem}")
        mem_bytes.append(saved.saved_tensor_mem)

    # mem_bytes[0] is the ReLU measurement, mem_bytes[1] the GELU one.
    print(f"ReLU/GELU act mem ratio: {mem_bytes[0]/mem_bytes[1]}")
45 | 


--------------------------------------------------------------------------------
/blog/act-mem-2/block_script.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Prints out the ratio of activation memory for a transformer Block when using ReLU vs GELU.
 3 | """
 4 | 
 5 | import torch
 6 | import torch.nn as nn
 7 | 
 8 | import act_mem
 9 | import layers
10 | 
# Script entry point: build one transformer Block per activation function, run
# a forward pass on a shared CUDA input, and report the bytes of tensors saved
# for the backward pass as measured by `act_mem.SavedTensorContext`.
if __name__ == "__main__":
    batch_size, seq_len, d_model, n_heads = 2, 4096, 1024, 2
    dtype = torch.bfloat16
    # A single input tensor is reused for every activation function.
    inputs = torch.randn(
        batch_size,
        seq_len,
        d_model,
        device="cuda",
        requires_grad=True,
        dtype=dtype,
    )

    # Insertion order matters: the final ratio indexes results as [ReLU, GELU].
    act_fn_dict = {"ReLU": nn.ReLU(), "GELU": nn.GELU()}
    # Append outputs to a list to keep tensors alive
    outputs = []
    mem_bytes = []

    for name, act_fn in act_fn_dict.items():
        block = layers.Block(
            d_model=d_model,
            act_fn=act_fn,
            n_heads=n_heads,
            device="cuda",
            dtype=dtype,
        )
        # The block is built before entering the contexts, and its parameters
        # are passed as `ignored_tensors`.
        # NOTE(review): presumably this excludes parameter memory from the
        # saved-tensor total; confirm against act_mem.
        # `mem` is bound here but not read afterwards in this script.
        with act_mem.AllocatedMemContext() as mem, act_mem.SavedTensorContext(
            ignored_tensors=block.parameters()
        ) as saved:
            out = block(inputs)
            outputs.append(out)
        print(f"{name} block bytes: {saved.saved_tensor_mem}")
        mem_bytes.append(saved.saved_tensor_mem)

    # mem_bytes[0] is the ReLU measurement, mem_bytes[1] the GELU one.
    print(f"ReLU/GeLU block act mem ratio: {mem_bytes[0]/mem_bytes[1]}")
45 | 


--------------------------------------------------------------------------------
/features/hp_constraints_mnist_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch HP Search Constraints (MNIST) 
 2 | This tutorial shows how to use Determined's HP Search Constraints with 
 3 | PyTorch. In this example, the constraints are defined in Lines 56-57 of 
 4 | the `__init__` function in `model_def.py` based on the model hyperparameters
 5 | via the `det.InvalidHP` exception API (see the `HP Search Constraints` topic
 6 | guide under https://docs.determined.ai/latest/topic-guides/index.html).
 7 | 
 8 | Constraints can also be defined in `train_batch` and `evaluate_batch`, 
 9 | where an InvalidHP exception can be raised based on 
10 | training and validation metrics respectively.
11 | 
12 | This example is based on Determined's `mnist_pytorch` tutorial, with the
13 | addition of the HP search constraint as the only modification.
14 | 
15 | ## Files
16 | * **model_def.py**: Where the HP Search constraint is defined and used.
17 | * All other files are identical to the `mnist_pytorch` tutorial code. 
18 | 
19 | ## To Run
20 | If you have not yet installed Determined, installation instructions can be found
21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
22 | 
23 | Run the following command: `det -m <master host:port> experiment create -f 
24 | adaptive.yaml .`.
25 | 
26 | ## Results
27 | Training the model with the hyperparameter settings in `adaptive.yaml` should yield
28 | a validation accuracy of ~97%. 
29 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/multiple-choice/swag_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_swag_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: roberta-base
 4 |   model_mode: multiple-choice
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: true
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 64
10 |   learning_rate: 5.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: swag
17 |   dataset_config_name: regular
18 |   train_file: null
19 |   validation_file: null
20 |   overwrite_cache: false
21 |   preprocessing_num_workers: null
22 |   max_seq_length: 128
23 |   pad_to_max_length: false
24 | # Number of records per epoch differs based on max_seq_length.
25 | records_per_epoch: 73546
26 | min_validation_period:
27 |   batches: 500
28 | searcher:
29 |   name: single
30 |   metric: accuracy
31 |   max_length:
32 |     epochs: 3
33 |   smaller_is_better: false
34 | environment:
35 |   image: 
36 |    gpu: determinedai/model-hub-transformers:0.26.2-dev0
37 | resources:
38 |   slots_per_trial: 2
39 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
40 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
41 | bind_mounts:
42 |   - host_path: /tmp
43 |     container_path: /root/.cache
44 | entrypoint: swag_trial:SWAGTrial
45 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/clm_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_clm_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: gpt2
 4 |   model_mode: causal-lm
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 8
10 |   learning_rate: 5.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: wikitext
17 |   dataset_config_name: wikitext-2-raw-v1
18 |   train_file: null
19 |   validation_file: null
20 |   max_seq_length: null
21 |   overwrite_cache: false
22 |   validation_split_percentage: 5
23 |   preprocessing_num_workers: null
24 | # Number of records per epoch differs based on max_seq_length.
25 | records_per_epoch: 2318
26 | min_validation_period:
27 |   batches: 500
28 | searcher:
29 |   name: single
30 |   metric: perplexity
31 |   max_length:
32 |     epochs: 3
33 |   smaller_is_better: true
34 | environment:
35 |   image: 
36 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
37 | resources:
38 |   slots_per_trial: 4
39 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
40 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
41 | bind_mounts:
42 |   - host_path: /tmp
43 |     container_path: /root/.cache
44 | entrypoint: clm_trial:CLMTrial
45 | 


--------------------------------------------------------------------------------
/blog/lora-parameters/ds_configs/ds_config_stage_3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "fp16": {
 3 |       "enabled": "auto",
 4 |       "loss_scale": 0,
 5 |       "loss_scale_window": 1000,
 6 |       "initial_scale_power": 16,
 7 |       "hysteresis": 2,
 8 |       "min_loss_scale": 1
 9 |     },
10 |     "bf16": {
11 |       "enabled": "auto"
12 |     },
13 |     "optimizer": {
14 |       "type": "AdamW",
15 |       "params": {
16 |         "lr": "auto",
17 |         "betas": "auto",
18 |         "eps": "auto",
19 |         "weight_decay": "auto"
20 |       }
21 |     },
22 |     "scheduler": {
23 |       "type": "WarmupDecayLR",
24 |       "params": {
25 |         "warmup_min_lr": "auto",
26 |         "warmup_max_lr": "auto",
27 |         "warmup_num_steps": "auto",
28 |         "total_num_steps": "auto"
29 |       }
30 |     },
31 |     "zero_optimization": {
32 |       "stage": 3,
33 |       "overlap_comm": true,
34 |       "contiguous_gradients": true,
35 |       "sub_group_size": 1e9,
36 |       "reduce_bucket_size": "auto",
37 |       "stage3_prefetch_bucket_size": "auto",
38 |       "stage3_param_persistence_threshold": "auto",
39 |       "stage3_max_live_parameters": 1e9,
40 |       "stage3_max_reuse_distance": 1e9,
41 |       "stage3_gather_16bit_weights_on_model_save": true
42 |     },
43 |     "gradient_accumulation_steps": "auto",
44 |     "gradient_clipping": "auto",
45 |     "train_batch_size": "auto",
46 |     "train_micro_batch_size_per_gpu": "auto"
47 |   }


--------------------------------------------------------------------------------
/gan/gan_mnist_pytorch/data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import shutil
 4 | import urllib.parse
 5 | from typing import Any, Dict
 6 | 
 7 | import requests
 8 | from torchvision import datasets, transforms
 9 | 
10 | 
def get_dataset(data_dir: str, train: bool) -> Any:
    """Return the torchvision MNIST dataset rooted at ``data_dir``.

    Args:
        data_dir: Root directory handed to ``datasets.MNIST``.
        train: Forwarded as the ``train`` flag of ``datasets.MNIST``.

    Returns:
        The dataset object; each image is converted to a tensor and then
        normalised with mean 0.5 and standard deviation 0.5.
    """
    to_tensor = transforms.ToTensor()
    normalize = transforms.Normalize((0.5,), (0.5,))
    pipeline = transforms.Compose([to_tensor, normalize])
    return datasets.MNIST(data_dir, train=train, transform=pipeline)
22 | 
23 | 
def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str:
    """Download (if needed) and unpack the archive named by ``data_config["url"]``.

    The archive is stored in ``<download_directory>/MNIST/<file name from URL>``
    and unpacked into that same ``MNIST`` directory. An archive that already
    exists on disk is reused without touching the network.

    Args:
        download_directory: Directory under which the ``MNIST`` folder is created.
        data_config: Mapping that must contain the archive location under ``"url"``.

    Returns:
        The parent of the ``MNIST`` directory (i.e. ``download_directory``).

    Raises:
        ValueError: If no file name can be derived from the URL path.
        requests.HTTPError: If the server answers with an error status.
    """
    url = data_config["url"]
    url_path = urllib.parse.urlparse(url).path
    basename = url_path.rsplit("/", 1)[-1]
    if not basename:
        raise ValueError("Cannot derive an archive file name from URL: {}".format(url))

    download_directory = os.path.join(download_directory, "MNIST")
    os.makedirs(download_directory, exist_ok=True)
    filepath = os.path.join(download_directory, basename)
    if not os.path.exists(filepath):
        logging.info("Downloading {} to {}".format(url, filepath))

        # Stream into a temporary name and rename only on success, so an
        # interrupted download is never mistaken for a complete archive.
        partial_path = filepath + ".part"
        with requests.get(url, stream=True, timeout=60) as r:
            # Fail loudly instead of saving an HTML error page as the archive.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)

    shutil.unpack_archive(filepath, download_directory)

    return os.path.dirname(download_directory)
44 | 


--------------------------------------------------------------------------------
/computer_vision/deformabledetr_coco_pytorch/startup-hook.sh:
--------------------------------------------------------------------------------
 1 | apt-get update
 2 | apt-get install unzip
 3 | 
 4 | # Download COCO 2017 annotations
 5 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
 6 | unzip -o annotations_trainval2017.zip
 7 | mv annotations/instances_train2017.json /tmp
 8 | mv annotations/instances_val2017.json /tmp
 9 | 
10 | # Clone Deformable-DETR library from source.
11 | # Since it is not an installable pacakge, we will have to add this to system path to import functions from it.
12 | git clone https://github.com/fundamentalvision/Deformable-DETR ddetr
13 | cd ddetr && git reset --hard 11169a60c33333af00a4849f1808023eba96a931
14 | # Need to fix a bug in the original code that fails to handle torchvision version 0.10 correctly.
15 | # Deformable DETR has some changes from DETR that need additional handling.
16 | sed -i 's/float(torchvision\.__version__\[:3\]) < 0.5/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py
17 | sed -i 's/float(torchvision\.__version__\[:3\]) < 0.7/int(torchvision\.__version__.split("\.")\[1\]) < 7/g' util/misc.py
18 | 
19 | pip install tqdm attrdict pycocotools cython scipy
20 | 
21 | # Build custom cuda ops
22 | cd models/ops
23 | sh ./make.sh
24 | cd ../../..
25 | 
26 | # Download pretrained model using link from https://github.com/fundamentalvision/Deformable-DETR
27 | pip install gdown
28 | gdown https://drive.google.com/uc?id=1nDWZWHuRwtwGden77NLM9JoWe-YisJnA -O model.ckpt
29 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/token-classification/ner_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_ner_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: bert-base-uncased
 4 |   model_mode: token-classification
 5 |   finetuning_task: ner
 6 |   use_pretrained_weights: true
 7 |   use_apex_amp: false
 8 |   # Training Args
 9 |   global_batch_size: 8
10 |   learning_rate: 5.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: conllpp
17 |   dataset_config_name: null
18 |   train_file: null
19 |   validation_file: null
20 |   preprocessing_num_workers: null
21 |   cache_dir: null
22 |   overwrite_cache: false
23 |   pad_to_max_length: false
24 |   label_all_tokens: false
25 | # Number of records per epoch differs based on max_seq_length.
26 | records_per_epoch: 14041
27 | min_validation_period:
28 |   batches: 500
29 | searcher:
30 |   name: single
31 |   metric: accuracy_score
32 |   max_length:
33 |     epochs: 3
34 |   smaller_is_better: false
35 | environment:
36 |   image: 
37 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
38 | resources:
39 |   slots_per_trial: 1
40 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
41 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
42 | bind_mounts:
43 |   - host_path: /tmp
44 |     container_path: /root/.cache
45 | entrypoint: ner_trial:NERTrial
46 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/text-classification/xnli_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_xnli_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: bert-base-multilingual-cased
 4 |   model_mode: sequence-classification
 5 |   finetuning_task: xnli
 6 |   use_apex_amp: false
 7 |   use_pretrained_weights: true
 8 |   do_lower_case: false
 9 |   # Training Args
10 |   global_batch_size: 32
11 |   learning_rate: 5.0e-5
12 |   adam_epsilon: 1.0e-8
13 |   weight_decay: 0
14 |   lr_scheduler_type: linear
15 |   num_warmup_steps: 0
16 | data:
17 |   dataset_name: xnli
18 |   language: de
19 |   train_language: en
20 |   max_seq_length: 128
21 |   train_file: null
22 |   validation_file: null
23 |   preprocessing_num_workers: null
24 |   cache_dir: null
25 |   overwrite_cache: false
26 |   pad_to_max_length: true
27 | # Number of records per epoch differs based on max_seq_length.
28 | records_per_epoch: 392702
29 | min_validation_period:
30 |   batches: 500
31 | searcher:
32 |   name: single
33 |   metric: accuracy
34 |   max_length:
35 |     epochs: 2
36 |   smaller_is_better: false
37 | environment:
38 |   image: 
39 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
40 | resources:
41 |   slots_per_trial: 2
42 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
43 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
44 | bind_mounts:
45 |   - host_path: /tmp
46 |     container_path: /root/.cache
47 | entrypoint: xnli_trial:XNLITrial
48 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_tf_keras/README.md:
--------------------------------------------------------------------------------
 1 | # TensorFlow (tf.keras) CIFAR-10 CNN Example
 2 | 
 3 | This example shows how to build a simple CNN on the CIFAR-10 dataset using
 4 | Determined's tf.keras API. This example is adapted from this [Keras CNN
 5 |  example](https://github.com/keras-team/keras/blob/keras-2/examples/cifar10_cnn.py).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: Organizes the model and data-loaders into the Determined TFKerasTrial API.
 9 | * **cifar_model.py**: The core code for the model. This includes building and compiling the model.
10 | 
11 | ### Configuration Files
12 | * **const.yaml**: Train the model with constant hyperparameter values. 
13 | * **distributed.yaml**: Same as `const.yaml`, but instead uses multiple GPUs.
14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm. 
15 | 
16 | ## Data:
17 | The current implementation uses CIFAR-10 data downloaded from AWS S3.
18 | 
19 | ## To Run:
20 | If you have not yet installed Determined, installation instructions can be found
21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
22 | 
23 | Run the following command: `det -m <master host:port> experiment create -f 
24 | const.yaml .`. The other configurations can be run by specifying the appropriate 
25 | configuration file in place of `const.yaml`.
26 | 
27 | ## Results:
28 | Training the model with the hyperparameter settings in `const.yaml` should yield
29 | a validation accuracy of ~74%.
30 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/mlm_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_mlm_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: roberta-base
 4 |   model_mode: masked-lm
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: true
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 8
10 |   learning_rate: 5.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: wikitext
17 |   dataset_config_name: wikitext-2-raw-v1
18 |   train_file: null
19 |   validation_file: null
20 |   overwrite_cache: false
21 |   validation_split_percentage: 5
22 |   max_seq_length: null
23 |   preprocessing_num_workers: null
24 |   mlm_probability: 0.15
25 |   line_by_line: false
26 |   pad_to_max_length: false
27 | # Number of records per epoch differs based on max_seq_length.
28 | records_per_epoch: 4798
29 | min_validation_period:
30 |   batches: 500
31 | searcher:
32 |   name: single
33 |   metric: perplexity
34 |   max_length:
35 |     epochs: 3
36 |   smaller_is_better: true
37 | environment:
38 |   image: 
39 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
40 | resources:
41 |   slots_per_trial: 1
42 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
43 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
44 | bind_mounts:
45 |   - host_path: /tmp
46 |     container_path: /root/.cache
47 | entrypoint: mlm_trial:MLMTrial
48 | 


--------------------------------------------------------------------------------
/gan/cyclegan/datasets.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import random
 3 | import os
 4 | 
 5 | from torch.utils.data import Dataset
 6 | from PIL import Image
 7 | import torchvision.transforms as transforms
 8 | 
 9 | 
def to_rgb(image):
    """Return a new RGB image of the same size with ``image`` pasted onto it."""
    canvas = Image.new("RGB", image.size)
    canvas.paste(image)
    return canvas
14 | 
15 | 
class ImageDataset(Dataset):
    """Paired image dataset read from ``<root>/<mode>/A`` and ``<root>/<mode>/B``.

    Each item is a dict ``{"A": ..., "B": ...}`` holding one transformed image
    from each folder. The dataset length is the size of the larger folder;
    indices wrap around in the smaller one.

    Args:
        root: Directory containing the ``<mode>/A`` and ``<mode>/B`` folders.
        transforms_: Sequence of transforms applied to both images. ``None``
            (the default) means no transforms.
        unaligned: If true, the B image is drawn at random instead of by index.
        mode: Sub-directory of ``root`` to read from (default ``"train"``).
    """

    def __init__(self, root, transforms_=None, unaligned=False, mode="train"):
        # Compose(None) would only blow up later, on the first __getitem__;
        # treat the default as an empty pipeline instead.
        if transforms_ is None:
            transforms_ = []
        self.transform = transforms.Compose(transforms_)
        self.unaligned = unaligned

        # Sorted so that index -> file mapping is deterministic across runs.
        self.files_A = sorted(glob.glob(os.path.join(root, "%s/A" % mode) + "/*.*"))
        self.files_B = sorted(glob.glob(os.path.join(root, "%s/B" % mode) + "/*.*"))

    def __getitem__(self, index):
        """Return ``{"A": item_A, "B": item_B}`` for the given index."""
        image_A = Image.open(self.files_A[index % len(self.files_A)])

        if self.unaligned:
            image_B = Image.open(self.files_B[random.randint(0, len(self.files_B) - 1)])
        else:
            image_B = Image.open(self.files_B[index % len(self.files_B)])

        # Convert grayscale images to rgb
        if image_A.mode != "RGB":
            image_A = to_rgb(image_A)
        if image_B.mode != "RGB":
            image_B = to_rgb(image_B)

        item_A = self.transform(image_A)
        item_B = self.transform(image_B)
        return {"A": item_A, "B": item_B}

    def __len__(self):
        """Return the number of files in the larger of the two folders."""
        return max(len(self.files_A), len(self.files_B))
44 | 


--------------------------------------------------------------------------------
/computer_vision/efficientdet_pytorch/efficientdet_files/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | import numpy as np
 4 | from PIL import Image
 5 | from torch.utils.data import Dataset
 6 | 
 7 | 
 8 | class FakeParser:
 9 |     def __init__(self):
10 |         self.img_ids = []
11 | 
12 |     def create_fake_img_ids(self, num_indices):
13 |         self.img_ids = [np.random.randint(1, 90) for i in range(num_indices)]
14 | 
15 | 
class FakeBackend(Dataset):
    """Synthetic detection dataset with a fixed length of 1000.

    Every item reuses the same image file (``loss_by_gpus.png``, resolved
    relative to the current working directory) and pairs it with random
    boxes and class ids.
    """

    def __init__(self, transform=None):
        self.transform = transform

    def __len__(self):
        return 1000

    def __getitem__(self, i):
        """Return ``(img, target)`` for index ``i``."""
        img = Image.open("loss_by_gpus.png").convert("RGB").resize((512, 512))
        target = {"img_idx": i, "img_size": (512, 512)}

        if self.transform is not None:
            img, target = self.transform(img, target)

        # Random annotations are attached after the transform has run.
        target["bbox"] = np.random.rand(2, 4)
        first_cls = np.random.randint(90)
        second_cls = np.random.randint(90)
        target["cls"] = np.array([first_cls, second_cls])

        return img, target
36 | 
37 | 
class DotDict(dict):
    """Dictionary whose entries can also be read, set and deleted as attributes.

    Args:
        dct: Mapping whose items are copied into the new instance. Any value
            that is the literal string ``"None"`` is stored as ``None``.

    Reading an attribute that is not a key returns ``None`` instead of raising
    ``AttributeError``.
    """

    # Attribute assignment / deletion map straight onto item assignment / deletion.
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

    def __init__(self, dct):
        for key, value in dct.items():
            # Only compare real strings with "None": comparing arbitrary values
            # (e.g. arrays) can raise or yield a non-boolean result.
            if isinstance(value, str) and value == "None":
                value = None
            self[key] = value

    def __getattr__(self, name):
        """Return ``self[name]``, or ``None`` when ``name`` is not a key."""
        # Only invoked when normal attribute lookup fails. dict.get gives the
        # "missing -> None" behaviour without a bare ``except`` that would also
        # swallow KeyboardInterrupt / SystemExit.
        return self.get(name)
53 | 


--------------------------------------------------------------------------------
/computer_vision/detectron2_coco_pytorch/Base-RCNN-FPN.yaml:
--------------------------------------------------------------------------------
 1 | MODEL:
 2 |   META_ARCHITECTURE: "GeneralizedRCNN"
 3 |   BACKBONE:
 4 |     NAME: "build_resnet_fpn_backbone"
 5 |   RESNETS:
 6 |     OUT_FEATURES: ["res2", "res3", "res4", "res5"]
 7 |   FPN:
 8 |     IN_FEATURES: ["res2", "res3", "res4", "res5"]
 9 |   ANCHOR_GENERATOR:
10 |     SIZES: [[32], [64], [128], [256], [512]]  # One size for each in feature map
11 |     ASPECT_RATIOS: [[0.5, 1.0, 2.0]]  # Three aspect ratios (same for all in feature maps)
12 |   RPN:
13 |     IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"]
14 |     PRE_NMS_TOPK_TRAIN: 2000  # Per FPN level
15 |     PRE_NMS_TOPK_TEST: 1000  # Per FPN level
16 |     # Detectron1 uses 2000 proposals per-batch,
17 |     # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue)
18 |     # which is approximately 1000 proposals per-image since the default batch size for FPN is 2.
19 |     POST_NMS_TOPK_TRAIN: 1000
20 |     POST_NMS_TOPK_TEST: 1000
21 |   ROI_HEADS:
22 |     NAME: "StandardROIHeads"
23 |     IN_FEATURES: ["p2", "p3", "p4", "p5"]
24 |   ROI_BOX_HEAD:
25 |     NAME: "FastRCNNConvFCHead"
26 |     NUM_FC: 2
27 |     POOLER_RESOLUTION: 7
28 |   ROI_MASK_HEAD:
29 |     NAME: "MaskRCNNConvUpsampleHead"
30 |     NUM_CONV: 4
31 |     POOLER_RESOLUTION: 14
32 | DATASETS:
33 |   TRAIN: ("coco_2017_train",)
34 |   TEST: ("coco_2017_val",)
35 | SOLVER:
36 |   IMS_PER_BATCH: 16
37 |   BASE_LR: 0.02
38 |   STEPS: (60000, 80000)
39 |   MAX_ITER: 90000
40 | INPUT:
41 |   MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800)
42 | VERSION: 2


--------------------------------------------------------------------------------
/nas/gaea_pytorch/search/data.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from torch.utils.data import Dataset
 3 | 
 4 | 
 5 | class BilevelDataset(Dataset):
 6 |     def __init__(
 7 |         self,
 8 |         dataset,
 9 |     ):
10 |         """
11 |         We will split the data into a train split and a validation split
12 |         and return one image from each split as a single observation.
13 | 
14 |         Args:
15 |             dataset: PyTorch Dataset object
16 |         """
17 |         inds = np.arange(len(dataset))
18 |         self.dataset = dataset
19 |         # Make sure train and val splits are of equal size.
20 |         # This is so we make sure to loop images in both train
21 |         # and val splits exactly once in an epoch.
22 |         n_train = int(0.5 * len(inds))
23 |         self.train_inds = inds[0:n_train]
24 |         self.val_inds = inds[n_train : 2 * n_train]
25 |         assert len(self.train_inds) == len(self.val_inds)
26 | 
27 |     def shuffle_val_inds(self):
28 |         # This is so we will see different pairs of images
29 |         # from train and val splits.  Will need to call this
30 |         # manually at epoch end.
31 |         np.random.shuffle(self.val_inds)
32 | 
33 |     def __len__(self):
34 |         return len(self.train_inds)
35 | 
36 |     def __getitem__(self, idx):
37 |         train_ind = self.train_inds[idx]
38 |         val_ind = self.val_inds[idx]
39 |         x_train, y_train = self.dataset[train_ind]
40 |         x_val, y_val = self.dataset[val_ind]
41 |         return x_train, y_train, x_val, y_val
42 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/language-modeling/plm_config.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_plm_trial
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: xlnet-base-cased
 4 |   model_mode: causal-lm
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 2
10 |   learning_rate: 2.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: wikitext
17 |   dataset_config_name: wikitext-2-raw-v1
18 |   train_file: null
19 |   validation_file: null
20 |   overwrite_cache: false
21 |   validation_split_percentage: 5
22 |   max_seq_length: 512
23 |   preprocessing_num_workers: null
24 |   plm_probability: 0.15
25 |   max_span_length: 5
26 |   line_by_line: false
27 |   pad_to_max_length: false
28 | # Number of records per epoch differs based on max_seq_length.
29 | records_per_epoch: 5334
30 | min_validation_period:
31 |   batches: 500
32 | searcher:
33 |   name: single
34 |   metric: perplexity
35 |   max_length:
36 |     epochs: 3
37 |   smaller_is_better: true
38 | environment:
39 |   image: 
40 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
41 | resources:
42 |   slots_per_trial: 2
43 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
44 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
45 | bind_mounts:
46 |   - host_path: /tmp
47 |     container_path: /root/.cache
48 | entrypoint: plm_trial:PLMTrial
49 | 


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_penntreebank_pytorch/const.yaml:
--------------------------------------------------------------------------------
 1 | name: darts_rnn_nas
 2 | 
 3 | data:
 4 |   data_download_dir: /data
 5 | 
 6 | bind_mounts:
 7 |     - host_path: /tmp
 8 |       container_path: /data
 9 |       read_only: false
10 | 
11 | hyperparameters:
12 |   learning_rate: 20
13 |   global_batch_size: 64
14 |   # Epoch to start checking whether we should switch to
15 |   # ASGD instead of SGD.
16 |   optimizer_switch_epoch: 75
17 |   eval_batch_size: 10
18 |   emsize: 850
19 |   nhid: 850
20 |   nhidlast: 850
21 |   bptt: 35
22 |   dropout: 0.75
23 |   dropouth: 0.25
24 |   dropoutx: 0.75
25 |   dropouti: 0.2
26 |   dropoute: 0.1
27 |   nonmono: 5
28 |   alpha: 0
29 |   beta: 1.0e-3
30 |   weight_decay: 8.0e-7
31 |   max_seq_length_delta: 20
32 |   clip_gradients_l2_norm: 0.25
33 | 
34 | 
35 |   # Tunable hyperparameters
36 |   node1_edge: 0
37 |   node2_edge: 1
38 |   node3_edge: 1
39 |   node4_edge: 1
40 |   node5_edge: 2
41 |   node6_edge: 5
42 |   node7_edge: 3
43 |   node8_edge: 5
44 | 
45 |   node1_op: sigmoid
46 |   node2_op: relu
47 |   node3_op: relu
48 |   node4_op: identity
49 |   node5_op: tanh
50 |   node6_op: sigmoid
51 |   node7_op: tanh
52 |   node8_op: relu
53 | 
54 | resources:
55 |   slots_per_trial: 2
56 | 
57 | scheduling_unit: 100
58 | 
59 | min_validation_period: 
60 |   batches: 400
61 | 
62 | optimizations:
63 |   average_training_metrics: true
64 | 
65 | searcher:
66 |   name: single 
67 |   metric: loss
68 |   max_length: 
69 |     batches: 10000
70 |   smaller_is_better: true 
71 | 
72 | entrypoint: model_def:DARTSRNNTrial
73 | 


--------------------------------------------------------------------------------
/computer_vision/cifar10_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch CIFAR-10 CNN Example
 2 | 
 3 | This example shows how to build a simple CNN on the CIFAR-10 dataset using
 4 | Determined's PyTorch API. This example is adapted from this [Keras CNN
 5 | example](https://github.com/keras-team/keras/blob/keras-2/examples/cifar10_cnn.py).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | 
10 | ### Configuration Files
11 | * **const.yaml**: Train the model with constant hyperparameter values.
12 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm.
13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training).
14 | * **distributed_inference.yaml**: Use the distributed training workflow with PyTorchTrial to accelerate batch inference workloads.
15 | 
16 | ## Data
17 | The CIFAR-10 dataset is downloaded from https://www.cs.toronto.edu/~kriz/cifar.html.
18 | 
19 | ## To Run
20 | If you have not yet installed Determined, installation instructions can be found
21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
22 | 
23 | Run the following command: `det -m <master-host:port> experiment create -f
24 | const.yaml .`. The other configurations can be run by specifying the appropriate
25 | configuration file in place of `const.yaml`.
26 | 
27 | ## Results
28 | Training the model with the hyperparameter settings in `const.yaml` should yield a validation accuracy of ~74%.
29 | 


--------------------------------------------------------------------------------
/deepspeed/cifar10_cpu_offloading/ds_config_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "train_batch_size": 128,
 3 |   "steps_per_print": 10,
 4 |   "optimizer": {
 5 |     "type": "Adam",
 6 |     "params": {
 7 |       "lr": 0.001,
 8 |       "betas": [
 9 |         0.8,
10 |         0.999
11 |       ],
12 |       "eps": 1e-8,
13 |       "weight_decay": 3e-7
14 |     }
15 |   },
16 |   "scheduler": {
17 |     "type": "WarmupLR",
18 |     "params": {
19 |       "warmup_min_lr": 0,
20 |       "warmup_max_lr": 0.001,
21 |       "warmup_num_steps": 1000
22 |     }
23 |   },
24 |   "zero_optimization": {
25 |     "stage": 3,
26 |     "offload_optimizer": {
27 |       "device": "cpu",
28 |       "pin_memory": true,
29 |       "buffer_count": 4,
30 |       "fast_init": false
31 |     },
32 |     "offload_param": {
33 |       "device": "cpu",
34 |       "pin_memory": true,
35 |       "buffer_count": 5,
36 |       "buffer_size": 1e8,
37 |       "max_in_cpu": 1e9
38 |     },
39 |     "allgather_partitions": true,
40 |     "allgather_bucket_size": 5e8,
41 |     "overlap_comm": true,
42 |     "reduce_scatter": true,
43 |     "reduce_bucket_size": 5e8,
44 |     "contiguous_gradients": true,
45 |     "stage3_max_live_parameters": 1e9,
46 |     "stage3_max_reuse_distance": 1e9,
47 |     "stage3_prefetch_bucket_size": 5e8,
48 |     "stage3_param_persistence_threshold": 1e6
49 |   },
50 |   "gradient_clipping": 1.0,
51 |   "fp16": {
52 |     "enabled": true,
53 |     "loss_scale": 0,
54 |     "initial_scale_power": 5,
55 |     "loss_scale_window": 1000,
56 |     "hysteresis": 2,
57 |     "min_loss_scale": 1
58 |   }
59 | }
60 | 


--------------------------------------------------------------------------------
/tutorials/fashion_mnist_tf_keras/README.md:
--------------------------------------------------------------------------------
 1 | # TensorFlow (tf.keras) Fashion MNIST Tutorial
 2 | 
 3 | This tutorial shows how to build a simple CNN on the Fashion MNIST dataset using
 4 | Determined's tf.keras API. This example is adapted from this [Keras image
 5 | classification tutorial](https://www.tensorflow.org/tutorials/keras/classification).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | * **data.py**: The data loading and preparation code for the model.
10 | 
11 | ### Configuration Files
12 | * **const.yaml**: Train the model with constant hyperparameter values.
13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training).
14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm.
15 | 
16 | ## Data
17 | The current implementation downloads the Fashion MNIST data from 
18 | [here](https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE).
19 | 
20 | ## To Run
21 | If you have not yet installed Determined, installation instructions can be found
22 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
23 | 
24 | Run the following command: `det -m <master host:port> experiment create -f 
25 | const.yaml .`. The other configurations can be run by specifying the appropriate 
26 | configuration file in place of `const.yaml`.
27 | 
28 | ## Results
29 | Training the model with the hyperparameter settings in `const.yaml` should yield
30 | a validation accuracy of ~85%. 
31 | 


--------------------------------------------------------------------------------
/computer_vision/byol_pytorch/generate_blob_list.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | from typing import Any, List
 3 | 
 4 | from google.cloud import storage
 5 | 
 6 | 
 7 | def list_blobs(storage_client: Any, bucket_name: str, prefix: str = None) -> List:
 8 |     # Helper functions for GCP from https://cloud.google.com/storage/docs/listing-objects#code-samples
 9 |     """Lists all the blobs in the bucket."""
10 |     blobs = storage_client.list_blobs(bucket_name, prefix=prefix)
11 |     return blobs
12 | 
13 | 
if __name__ == "__main__":
    # Command-line entry point: list every blob under a bucket/prefix and write
    # one blob name per line to the requested output file.
    parser = ArgumentParser(
        description="""Generate a listing of all blobs in a given GCS bucket/path for consumption by GCSImageFolder.
        After running, upload the file to the GCS bucket and supply its path in data_config.gcs_train_blob_list_path or
        data_config.gcs_validation_blob_list_path.
        See distributed-imagenet.yaml for an example."""
    )
    parser.add_argument(
        "--bucket-name",
        type=str,
        required=True,
        help="Name of the GCS bucket, without gs:// prefix.",
    )
    parser.add_argument(
        "--bucket-path",
        type=str,
        required=True,
        help="Path prefix.",
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="File to output listing to.",
    )
    args = parser.parse_args()

    storage_client = storage.Client()
    blobs = list_blobs(storage_client, args.bucket_name, prefix=args.bucket_path)

    # One blob name per line.
    with open(args.output_file, "w") as f:
        f.writelines(b.name + "\n" for b in blobs)
35 | 


--------------------------------------------------------------------------------
/hp_search_benchmarks/darts_penntreebank_pytorch/randomNAS_files/genotypes.py:
--------------------------------------------------------------------------------
 1 | from collections import namedtuple
 2 | 
# A genotype is a named tuple with two fields:
#   recurrent: list of (primitive name, int) pairs, one pair per step
#   concat:    sequence of ints
# NOTE(review): the int in each pair is presumably the index of the earlier
# state that feeds the step, and `concat` the states combined into the cell
# output -- confirm against the model code that consumes these genotypes.
Genotype = namedtuple("Genotype", "recurrent concat")

# Operation names; every name used in the genotypes below appears in this list.
PRIMITIVES = ["none", "tanh", "relu", "sigmoid", "identity"]
# NOTE(review): presumably the number of steps per cell and the number of
# states concatenated -- confirm against the consumer of these constants.
STEPS = 8
CONCAT = 8

# Unlike the other genotypes here, ENAS lists 11 recurrent entries and an
# explicit `concat` list rather than range(1, 9).
ENAS = Genotype(
    recurrent=[
        ("tanh", 0),
        ("tanh", 1),
        ("relu", 1),
        ("tanh", 3),
        ("tanh", 3),
        ("relu", 3),
        ("relu", 4),
        ("relu", 7),
        ("relu", 8),
        ("relu", 8),
        ("relu", 8),
    ],
    concat=[2, 5, 6, 9, 10, 11],
)

# Eight-entry genotype; concat covers 1..8.
DARTS_V1 = Genotype(
    recurrent=[
        ("relu", 0),
        ("relu", 1),
        ("tanh", 2),
        ("relu", 3),
        ("relu", 4),
        ("identity", 1),
        ("relu", 5),
        ("relu", 1),
    ],
    concat=range(1, 9),
)
# Eight-entry genotype; concat covers 1..8.
DARTS_V2 = Genotype(
    recurrent=[
        ("sigmoid", 0),
        ("relu", 1),
        ("relu", 1),
        ("identity", 1),
        ("tanh", 2),
        ("sigmoid", 5),
        ("tanh", 3),
        ("relu", 5),
    ],
    concat=range(1, 9),
)

# `DARTS` is an alias for the V2 genotype.
DARTS = DARTS_V2

# Eight-entry genotype; concat covers 1..8.
ASHA = Genotype(
    recurrent=[
        ("relu", 0),
        ("relu", 0),
        ("sigmoid", 0),
        ("tanh", 0),
        ("relu", 1),
        ("tanh", 0),
        ("identity", 5),
        ("sigmoid", 0),
    ],
    concat=range(1, 9),
)
68 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: bert-base-uncased
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 12
10 |   learning_rate: 3.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: squad
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: false
24 |   null_score_diff_threshold: 0
25 |   doc_stride: 128
26 |   n_best_size: 20
27 |   max_answer_length: 30
28 |   output_dir: /tmp
29 | # Number of records per epoch differs based on max_seq_length.
30 | records_per_epoch: 88524
31 | min_validation_period:
32 |   batches: 5000
33 | searcher:
34 |   name: single
35 |   metric: f1
36 |   max_length:
37 |     epochs: 2
38 |   smaller_is_better: false
39 | environment:
40 |   image: 
41 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
42 | resources:
43 |   slots_per_trial: 1
44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
46 | bind_mounts:
47 |   - host_path: /tmp
48 |     container_path: /root/.cache
49 |   - host_path: /tmp
50 |     container_path: /tmp
51 | entrypoint: qa_trial:QATrial
52 | 


--------------------------------------------------------------------------------
/features/checkpoint_hooks_pytorch/README.md:
--------------------------------------------------------------------------------
 1 | # PyTorch MNIST CNN Tutorial
 2 | This tutorial shows how to build a simple CNN on the MNIST dataset using
 3 | Determined's PyTorch API and showcases PyTorchTrial checkpoint callbacks.
 4 | This example is adapted from this [PyTorch MNIST
 5 | tutorial](https://github.com/pytorch/examples/tree/master/mnist).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | * **data.py**: The data loading and preparation code for the model.
10 | * **layers.py**: Defines the convolutional layers that the model uses. 
11 | 
12 | ### Configuration Files
13 | * **const.yaml**: Train the model with constant hyperparameter values.
14 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training).
15 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm.
16 | 
17 | ## Data
18 | The current implementation uses MNIST data downloaded from AWS S3.
19 | 
20 | ## To Run
21 | If you have not yet installed Determined, installation instructions can be found
22 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
23 | 
24 | Run the following command: `det -m <master host:port> experiment create -f 
25 | const.yaml .`. The other configurations can be run by specifying the appropriate 
26 | configuration file in place of `const.yaml`.
27 | 
28 | ## Results
29 | Training the model with the hyperparameter settings in `const.yaml` should yield
30 | a validation accuracy of ~97%. 
31 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad_v2.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad_v2
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: bert-base-uncased
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 12
10 |   learning_rate: 3.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: squad_v2
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: true
24 |   null_score_diff_threshold: 0
25 |   doc_stride: 128
26 |   n_best_size: 20
27 |   max_answer_length: 30
28 |   output_dir: /tmp
29 | # Number of records per epoch differs based on max_seq_length.
30 | records_per_epoch: 131754
31 | min_validation_period:
32 |   batches: 5000
33 | searcher:
34 |   name: single
35 |   metric: f1
36 |   max_length:
37 |     epochs: 4
38 |   smaller_is_better: false
39 | environment:
40 |   image: 
41 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
42 | resources:
43 |   slots_per_trial: 1
44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
46 | bind_mounts:
47 |   - host_path: /tmp
48 |     container_path: /root/.cache
49 |   - host_path: /tmp
50 |     container_path: /tmp
51 | entrypoint: qa_trial:QATrial
52 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad_beam_search.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad_with_beam_search
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: xlnet-large-cased
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 4
10 |   learning_rate: 3.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: squad
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: false
24 |   doc_stride: 128
25 |   n_best_size: 20
26 |   max_answer_length: 30
27 |   output_dir: /tmp
28 | # Number of records per epoch differs based on max_seq_length.
29 | records_per_epoch: 88835
30 | min_validation_period:
31 |   batches: 5000
32 | searcher:
33 |   name: single
34 |   metric: f1
35 |   max_length:
36 |     epochs: 2
37 |   smaller_is_better: false
38 | environment:
39 |   image: 
40 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
41 | resources:
42 |   slots_per_trial: 1
43 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
44 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
45 | bind_mounts:
46 |   - host_path: /tmp
47 |     container_path: /root/.cache
48 |   - host_path: /tmp
49 |     container_path: /tmp
50 | entrypoint: qa_beam_search_trial:QABeamSearchTrial
51 | 


--------------------------------------------------------------------------------
/features/torch_batch_process_core_api_comparison/README.md:
--------------------------------------------------------------------------------
 1 | # Batch inference with Core API & Torch Batch Processing API
 2 | 
 3 | ## Overview
 4 | 
 5 | This example illustrates how to run distributed batch inference with Core API. Determined's Core API is very flexible
 6 | and can be used to run almost anything, including batch inference. 
 7 | 
 8 | With Core API, we are able to write an example that
 9 | - is distributed across worker
10 | - can be preempted and resumed
11 | - can be monitored on the Determined UI
12 | 
13 | However, using Core API directly would require the user to directly handle 
14 | - low-level parallel programming concepts such as gather, rank 
15 | - Determined machinery such as creating and loading checkpoints, preemption and resumption
16 | - initialization of appropriate distributed context
17 | - proper sharding of dataset
18 | 
19 | You will see that using the Torch Batch Processing API for the same task is a lot easier, as it abstracts away all the 
20 | low-level details and provides useful helper functions.
21 | 
22 | ## Details on this example
23 | 
24 | We are running inference with a simple vision model on the CIFAR10 dataset. We then store the prediction outcome to the
25 | file system in the Core API example and to the same storage system used by Determined checkpoints in the 
26 | `torch_batch_process` example. You can access the output through the underlying storage (e.g. s3 bucket, shared_fs).
27 | 
28 | To run the Core API example, simply run `det e create core_api_config.yaml .`
29 | To run the Torch Batch Processing example, simply run `det e create torch_batch_process_config.yaml .`
30 | 


--------------------------------------------------------------------------------
/gan/cyclegan/utils.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | import datetime
 4 | import sys
 5 | 
 6 | from torch.autograd import Variable
 7 | import torch
 8 | import numpy as np
 9 | 
10 | from torchvision.utils import save_image
11 | 
12 | 
class ReplayBuffer:
    """Bounded pool of previously pushed tensors, used to mix old and new samples.

    Args:
        max_size: Maximum number of stored elements; must be positive.
    """

    def __init__(self, max_size=50):
        assert max_size > 0, "Empty buffer or trying to create a black hole. Be careful."
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, data):
        """Push every element of ``data`` and return a batch of the same length.

        While the buffer is not full each element is stored and returned
        unchanged. Once full, each element is either returned as-is or, when
        ``random.uniform(0, 1)`` exceeds 0.5, swapped with a randomly chosen
        stored element whose clone is returned instead.
        """
        batch_out = []
        for sample in data.data:
            # Add a leading batch axis so the pieces can be concatenated below.
            sample = torch.unsqueeze(sample, 0)

            if len(self.data) < self.max_size:
                # Still filling up: keep the sample and hand it straight back.
                self.data.append(sample)
                batch_out.append(sample)
                continue

            if random.uniform(0, 1) <= 0.5:
                batch_out.append(sample)
            else:
                # Swap: return a stored sample and remember the new one in its slot.
                slot = random.randint(0, self.max_size - 1)
                batch_out.append(self.data[slot].clone())
                self.data[slot] = sample
        return Variable(torch.cat(batch_out))
34 | 
35 | 
class LambdaLR:
    """Linear-decay multiplier schedule.

    Args:
        n_epochs: Total number of epochs; must exceed ``decay_start_epoch``.
        offset: Value added to the epoch passed to :meth:`step`.
        decay_start_epoch: Epoch after which the multiplier starts to drop.
    """

    def __init__(self, n_epochs, offset, decay_start_epoch):
        assert (n_epochs - decay_start_epoch) > 0, "Decay must start before the training session ends!"
        self.n_epochs = n_epochs
        self.offset = offset
        self.decay_start_epoch = decay_start_epoch

    def step(self, epoch):
        """Return the multiplier for ``epoch``.

        The value is 1.0 until ``epoch + offset`` passes ``decay_start_epoch``
        and then falls linearly, reaching 0.0 when ``epoch + offset`` equals
        ``n_epochs``.
        """
        epochs_into_decay = max(0, epoch + self.offset - self.decay_start_epoch)
        decay_span = self.n_epochs - self.decay_start_epoch
        return 1.0 - epochs_into_decay / decay_span
45 | 


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/README.md:
--------------------------------------------------------------------------------
 1 | # DCGAN TensorFlow Keras GAN Example
 2 | 
 3 | This example demonstrates how to build a simple GAN on the MNIST dataset using Determined's TensorFlow Keras API. This example is adapted from this [TensorFlow Tutorial](https://www.tensorflow.org/tutorials/generative/dcgan).
 4 | The DCGAN Keras model featured in this example subclasses `tf.keras.Model` and defines a custom `train_step()` and `test_step()`. This functionality was first added in TensorFlow 2.2.
 5 | 
 6 | ## Files
 7 | * **dc_gan.py**: The core code defining the model.
 8 | * **data.py**: The data loading and preparation code for the model.
 9 | * **model_def.py**: Organizes the model into Determined's TensorFlow Keras API.
10 | * **export.py**: Exports a trained checkpoint and uses it to generate images.
11 | 
12 | 
13 | ### Configuration Files
14 | * **const.yaml**: Train the model with constant hyperparameter values.
15 | * **distributed.yaml**: Same as const.yaml, but instead uses multiple GPUs (distributed training).
16 | 
17 | ## To Run
18 | Installation instructions can be found under `docs/install-admin.html` or at [Determined installation page](https://docs.determined.ai/latest/index.html).
19 | After configuring the settings in `const.yaml`, run the following command: `det -m <master host:port> experiment create -f const.yaml . `
20 | 
21 | ## To Export
22 | Once the model has been trained, its top checkpoint can be exported and used to generate images by running:
23 | ```bash
24 | python export.py --experiment-id <experiment_id> --master-url <master:port>
25 | ```
26 | 
27 | ![Generate Images](./images/dcgan_inference_example.png)
28 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad_distributed.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad_distributed
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: bert-large-uncased-whole-word-masking
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 24
10 |   learning_rate: 3e-5
11 |   adam_epsilon: 1e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: squad
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: false
24 |   null_score_diff_threshold: 0
25 |   doc_stride: 128
26 |   n_best_size: 20
27 |   max_answer_length: 30
28 |   output_dir: /tmp
29 | # Number of records per epoch differs based on max_seq_length.
30 | records_per_epoch: 88524
31 | min_validation_period:
32 |   batches: 5000
33 | searcher:
34 |   name: single
35 |   metric: f1
36 |   max_length:
37 |     epochs: 2
38 |   smaller_is_better: false
39 | environment:
40 |   image: 
41 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
42 | resources:
43 |   slots_per_trial: 8
44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
46 | bind_mounts:
47 |   - host_path: /tmp
48 |     container_path: /root/.cache
49 |   - host_path: /tmp
50 |     container_path: /tmp
51 | entrypoint: qa_trial:QATrial
52 | 


--------------------------------------------------------------------------------
/blog/lora-parameters/lora.yaml:
--------------------------------------------------------------------------------
 1 | name: mistral lora hard
 2 | debug: false
 3 | environment:
 4 |   environment_variables:
 5 |     - NCCL_DEBUG=INFO
 6 |     - NCCL_SOCKET_IFNAME=ens,eth,ib
 7 |   image: 
 8 |     gpu: determinedai/environments:cuda-11.8-pytorch-2.0-gpu-95c7a14
 9 |     cpu: determinedai/environments:py-3.10-pytorch-2.0-cpu-03ae7d7
10 | resources:
11 |   slots_per_trial: 2
12 |   resource_pool: <RESOURCE_POOL> # We used A100 40GB GPUs
13 | workspace: <WORKSPACE_NAME>
14 | project: <PROJECT>
15 | searcher:
16 |   name: grid
17 |   max_length:
18 |     batches: 3000
19 |   metric: eval_accuracy
20 |   smaller_is_better: false
21 | hyperparameters:
22 |   model: "mistralai/Mistral-7B-Instruct-v0.2"
23 |   model_commit_hash: "99259002b41e116d28ccb2d04a9fbe22baed0c7f"
24 |   dataset_subset: "hard"
25 |   lora: true
26 |   r:
27 |     type: categorical
28 |     vals: [2, 8, 32, 128]
29 |   lora_alpha:
30 |     type: categorical
31 |     vals: [0.5, 1, 2, 8, 32, 128, 256, 512]
32 |   lora_dropout: 
33 |     type: categorical
34 |     vals: [0.1]
35 |   hf_token: <HF_TOKEN>
36 |   training_args:
37 |     output_dir: "/tmp/llm_finetuning"
38 |     max_steps: 3000
39 |     per_device_train_batch_size: 4
40 |     per_device_eval_batch_size: 4
41 |     bf16: true
42 |     evaluation_strategy: "steps"
43 |     eval_steps: 500
44 |     logging_strategy: "steps"
45 |     logging_steps: 100
46 |     save_strategy: "steps"
47 |     save_steps: 1000
48 |     learning_rate: 1e-5
49 |     deepspeed: true
50 |     gradient_checkpointing: true
51 |   use_rslora: false
52 | entrypoint: >-
53 |   python -m determined.launch.torch_distributed
54 |   python finetune.py
55 | max_restarts: 0


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad_v2_beam_search.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad_v2_with_beam_search
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: xlnet-large-cased
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 4
10 |   learning_rate: 3.0e-5
11 |   adam_epsilon: 1.0e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 0
15 | data:
16 |   dataset_name: squad_v2
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: true
24 |   null_score_diff_threshold: 0
25 |   doc_stride: 128
26 |   n_best_size: 20
27 |   max_answer_length: 30
28 |   output_dir: /tmp
29 | # Number of records per epoch differs based on max_seq_length.
30 | records_per_epoch: 132240
31 | min_validation_period:
32 |   batches: 5000
33 | searcher:
34 |   name: single
35 |   metric: f1
36 |   max_length:
37 |     epochs: 4
38 |   smaller_is_better: false
39 | environment:
40 |   image: 
41 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
42 | resources:
43 |   slots_per_trial: 1
44 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
45 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
46 | bind_mounts:
47 |   - host_path: /tmp
48 |     container_path: /root/.cache
49 |   - host_path: /tmp
50 |     container_path: /tmp
51 | entrypoint: qa_beam_search_trial:QABeamSearchTrial
52 | 


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/const_fake.yaml:
--------------------------------------------------------------------------------
 1 | name: detr_coco_fake_data
 2 | hyperparameters:
 3 |     lr: 1.0e-4
 4 |     lr_backbone: 1.0e-5
 5 |     global_batch_size: 2
 6 |     weight_decay: 1.0e-4
 7 |     lr_drop: 100
 8 |     clip_max_norm: 0.1
 9 | 
10 |     # Set to true if you want to warmstart with pretrained weights.
11 |     warmstart: false
12 | 
13 |     # Backbone
14 |     backbone: resnet50
15 |     dilation: false
16 |     position_embedding: sine
17 | 
18 |     # Transformer
19 |     enc_layers: 6
20 |     dec_layers: 6
21 |     dim_feedforward: 2048
22 |     hidden_dim: 256
23 |     dropout: 0.1
24 |     nheads: 8
25 |     num_queries: 100
26 |     pre_norm: false
27 | 
28 |     # Loss
29 |     aux_loss: true
30 | 
31 |     # Matcher
32 |     set_cost_class: 1
33 |     set_cost_bbox: 5
34 |     set_cost_giou: 2
35 | 
36 |     # Loss Coefficients
37 |     mask_loss_coef: 1
38 |     dice_loss_coef: 1
39 |     bbox_loss_coef: 5
40 |     giou_loss_coef: 2
41 |     eos_coef: 0.1
42 | 
43 |     # Dataset
44 |     dataset_file: coco
45 |     backend: fake # Specify the backend you want to use. One of: gcs, aws, fake, local
46 |     data_dir: /data # bucket name if using gcs or aws, otherwise directory to dataset
47 |     masks: false
48 |     num_workers: 4
49 | 
50 |     device: cuda
51 | 
52 | bind_mounts:
53 |     - host_path: /tmp
54 |       container_path: /data
55 |       read_only: false
56 | 
57 | records_per_epoch: 117264
58 | searcher:
59 |     name: single
60 |     metric: mAP
61 |     smaller_is_better: false
62 |     max_length:
63 |         batches: 100 
64 | resources:
65 |     shm_size: 2000000000
66 | 
67 | entrypoint: model_def:DETRTrial
68 | 


--------------------------------------------------------------------------------
/features/checkpoint_hooks_pytorch/data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import shutil
 4 | import urllib.parse
 5 | from typing import Any, Dict
 6 | 
 7 | import requests
 8 | from torchvision import datasets, transforms
 9 | 
10 | 
def get_dataset(data_dir: str, train: bool) -> Any:
    """Build the torchvision MNIST dataset rooted at ``data_dir``.

    ``train`` is forwarded unchanged to ``datasets.MNIST``. Each sample is
    converted to a tensor and then normalized.
    """
    # Precomputed mean and standard deviation of the MNIST data; normalizing
    # with them gives the data zero mean and unit standard deviation.
    mnist_mean, mnist_std = (0.1307,), (0.3081,)
    preprocessing = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mnist_mean, mnist_std)]
    )
    return datasets.MNIST(data_dir, train=train, transform=preprocessing)
25 | 
26 | 
def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str:
    """Fetch the archive at ``data_config["url"]`` and unpack it under ``<download_directory>/MNIST``.

    The archive is stored as ``<download_directory>/MNIST/<last component of
    the URL path>``. If a file with that name already exists the download is
    skipped; the archive is unpacked on every call either way.

    Args:
        download_directory: Directory under which the ``MNIST`` subdirectory is created.
        data_config: Mapping that must contain a ``"url"`` entry for the archive.

    Returns:
        The parent directory of the ``MNIST`` subdirectory.

    Raises:
        KeyError: If ``data_config`` has no ``"url"`` entry.
        IndexError: If the URL path contains no ``/``.
        requests.HTTPError: If the server answers with an error status.
    """
    url = data_config["url"]
    url_path = urllib.parse.urlparse(url).path
    basename = url_path.rsplit("/", 1)[1]

    download_directory = os.path.join(download_directory, "MNIST")
    os.makedirs(download_directory, exist_ok=True)
    filepath = os.path.join(download_directory, basename)
    if not os.path.exists(filepath):
        logging.info("Downloading %s to %s", url, filepath)

        # Stream into a sibling ".part" file and rename it only once the
        # download finished: the existence check above treats any file at
        # `filepath` as a complete archive, so a truncated or error-page
        # download must never be left there.
        partial_path = filepath + ".part"
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)

    shutil.unpack_archive(filepath, download_directory)

    return os.path.dirname(download_directory)
47 | 


--------------------------------------------------------------------------------
/features/custom_reducers_mnist_pytorch/data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import shutil
 4 | import urllib.parse
 5 | from typing import Any, Dict
 6 | 
 7 | import requests
 8 | from torchvision import datasets, transforms
 9 | 
10 | 
def get_dataset(data_dir: str, train: bool) -> Any:
    """Return the torchvision MNIST dataset stored under ``data_dir``.

    ``train`` is passed straight through to ``datasets.MNIST``. Samples are
    turned into tensors and then normalized.
    """
    # Precomputed mean and standard deviation of the MNIST data; normalizing
    # with them gives the data zero mean and unit standard deviation.
    mnist_mean, mnist_std = (0.1307,), (0.3081,)
    to_normalized_tensor = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mnist_mean, mnist_std)]
    )
    return datasets.MNIST(data_dir, train=train, transform=to_normalized_tensor)
25 | 
26 | 
def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str:
    """Fetch the archive at ``data_config["url"]`` and unpack it under ``<download_directory>/MNIST``.

    The archive is stored as ``<download_directory>/MNIST/<last component of
    the URL path>``. If a file with that name already exists the download is
    skipped; the archive is unpacked on every call either way.

    Args:
        download_directory: Directory under which the ``MNIST`` subdirectory is created.
        data_config: Mapping that must contain a ``"url"`` entry for the archive.

    Returns:
        The parent directory of the ``MNIST`` subdirectory.

    Raises:
        KeyError: If ``data_config`` has no ``"url"`` entry.
        IndexError: If the URL path contains no ``/``.
        requests.HTTPError: If the server answers with an error status.
    """
    url = data_config["url"]
    url_path = urllib.parse.urlparse(url).path
    basename = url_path.rsplit("/", 1)[1]

    download_directory = os.path.join(download_directory, "MNIST")
    os.makedirs(download_directory, exist_ok=True)
    filepath = os.path.join(download_directory, basename)
    if not os.path.exists(filepath):
        logging.info("Downloading %s to %s", url, filepath)

        # Stream into a sibling ".part" file and rename it only once the
        # download finished: the existence check above treats any file at
        # `filepath` as a complete archive, so a truncated or error-page
        # download must never be left there.
        partial_path = filepath + ".part"
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)

    shutil.unpack_archive(filepath, download_directory)

    return os.path.dirname(download_directory)
47 | 


--------------------------------------------------------------------------------
/features/hp_constraints_mnist_pytorch/data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import shutil
 4 | import urllib.parse
 5 | from typing import Any, Dict
 6 | 
 7 | import requests
 8 | from torchvision import datasets, transforms
 9 | 
10 | 
def get_dataset(data_dir: str, train: bool) -> Any:
    """Construct the torchvision MNIST dataset located at ``data_dir``.

    ``train`` is handed to ``datasets.MNIST`` as-is. Every sample goes through
    ``ToTensor`` followed by normalization.
    """
    # Precomputed mean and standard deviation of the MNIST data; normalizing
    # with them gives the data zero mean and unit standard deviation.
    mnist_mean, mnist_std = (0.1307,), (0.3081,)
    pipeline = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mnist_mean, mnist_std)]
    )
    return datasets.MNIST(data_dir, train=train, transform=pipeline)
25 | 
26 | 
def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str:
    """Fetch the archive at ``data_config["url"]`` and unpack it under ``<download_directory>/MNIST``.

    The archive is stored as ``<download_directory>/MNIST/<last component of
    the URL path>``. If a file with that name already exists the download is
    skipped; the archive is unpacked on every call either way.

    Args:
        download_directory: Directory under which the ``MNIST`` subdirectory is created.
        data_config: Mapping that must contain a ``"url"`` entry for the archive.

    Returns:
        The parent directory of the ``MNIST`` subdirectory.

    Raises:
        KeyError: If ``data_config`` has no ``"url"`` entry.
        IndexError: If the URL path contains no ``/``.
        requests.HTTPError: If the server answers with an error status.
    """
    url = data_config["url"]
    url_path = urllib.parse.urlparse(url).path
    basename = url_path.rsplit("/", 1)[1]

    download_directory = os.path.join(download_directory, "MNIST")
    os.makedirs(download_directory, exist_ok=True)
    filepath = os.path.join(download_directory, basename)
    if not os.path.exists(filepath):
        logging.info("Downloading %s to %s", url, filepath)

        # Stream into a sibling ".part" file and rename it only once the
        # download finished: the existence check above treats any file at
        # `filepath` as a complete archive, so a truncated or error-page
        # download must never be left there.
        partial_path = filepath + ".part"
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)

    shutil.unpack_archive(filepath, download_directory)

    return os.path.dirname(download_directory)
47 | 


--------------------------------------------------------------------------------
/model_hub/huggingface/question-answering/squad_v2_albert.yaml:
--------------------------------------------------------------------------------
 1 | name: huggingface_squad_v2_albert
 2 | hyperparameters:
 3 |   pretrained_model_name_or_path: albert-xxlarge-v2
 4 |   model_mode: question-answering
 5 |   use_pretrained_weights: true
 6 |   use_apex_amp: false
 7 |   cache_dir: null
 8 |   # Training Args
 9 |   global_batch_size: 16
10 |   learning_rate: 5e-5
11 |   adam_epsilon: 1e-8
12 |   weight_decay: 0
13 |   lr_scheduler_type: linear
14 |   num_warmup_steps: 1620
15 | data:
16 |   dataset_name: squad_v2
17 |   train_file: null
18 |   validation_file: null
19 |   overwrite_cache: false
20 |   preprocessing_num_workers: null
21 |   max_seq_length: 384
22 |   pad_to_max_length: true
23 |   version_2_with_negative: true
24 |   null_score_diff_threshold: 0
25 |   doc_stride: 128
26 |   n_best_size: 20
27 |   max_answer_length: 30
28 |   output_dir: /tmp
29 | optimizations:
30 |   aggregation_frequency: 3
31 | # Number of records per epoch differs based on max_seq_length.
32 | records_per_epoch: 131754
33 | min_validation_period:
34 |   batches: 5000
35 | searcher:
36 |   name: single
37 |   metric: f1
38 |   max_length:
39 |     batches: 16500
40 |   smaller_is_better: false
41 | environment:
42 |   image: 
43 |     gpu: determinedai/model-hub-transformers:0.26.2-dev0
44 | resources:
45 |   slots_per_trial: 8
46 | # We add a bind_mount here so that cached data, tokenized data, and models will be saved to the
47 | # host_path on the agent instance disk for reuse if the same experiment is run on this instance.
48 | bind_mounts:
49 |   - host_path: /tmp
50 |     container_path: /root/.cache
51 |   - host_path: /tmp
52 |     container_path: /tmp
53 | entrypoint: qa_trial:QATrial
54 | 


--------------------------------------------------------------------------------
/blog/python_sdk_demo/mednist_model/net.py:
--------------------------------------------------------------------------------
 1 | import torch.nn as nn
 2 | 
 3 | 
 4 | # from https://github.com/MedMNIST/MedMNIST/blob/main/examples/getting_started.ipynb
 5 | class Net(nn.Module):
 6 |     def __init__(self, in_channels, num_classes):
 7 |         super().__init__()
 8 | 
 9 |         self.layer1 = nn.Sequential(
10 |             nn.Conv2d(in_channels, 16, kernel_size=3), nn.BatchNorm2d(16), nn.ReLU()
11 |         )
12 | 
13 |         self.layer2 = nn.Sequential(
14 |             nn.Conv2d(16, 16, kernel_size=3),
15 |             nn.BatchNorm2d(16),
16 |             nn.ReLU(),
17 |             nn.MaxPool2d(kernel_size=2, stride=2),
18 |         )
19 | 
20 |         self.layer3 = nn.Sequential(
21 |             nn.Conv2d(16, 64, kernel_size=3), nn.BatchNorm2d(64), nn.ReLU()
22 |         )
23 | 
24 |         self.layer4 = nn.Sequential(
25 |             nn.Conv2d(64, 64, kernel_size=3), nn.BatchNorm2d(64), nn.ReLU()
26 |         )
27 | 
28 |         self.layer5 = nn.Sequential(
29 |             nn.Conv2d(64, 64, kernel_size=3, padding=1),
30 |             nn.BatchNorm2d(64),
31 |             nn.ReLU(),
32 |             nn.MaxPool2d(kernel_size=2, stride=2),
33 |         )
34 | 
35 |         self.fc = nn.Sequential(
36 |             nn.Linear(64 * 4 * 4, 128),
37 |             nn.ReLU(),
38 |             nn.Linear(128, 128),
39 |             nn.ReLU(),
40 |             nn.Linear(128, num_classes),
41 |         )
42 | 
43 |     def forward(self, x):
44 |         x = self.layer1(x)
45 |         x = self.layer2(x)
46 |         x = self.layer3(x)
47 |         x = self.layer4(x)
48 |         x = self.layer5(x)
49 |         x = x.view(x.size(0), -1)
50 |         x = self.fc(x)
51 |         return x
52 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/experiment_files/data.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import shutil
 4 | import urllib.parse
 5 | from typing import Any, Dict
 6 | 
 7 | import requests
 8 | from torchvision import datasets, transforms
 9 | 
10 | 
def get_dataset(data_dir: str, train: bool) -> Any:
    """Create the torchvision MNIST dataset found under ``data_dir``.

    ``train`` is forwarded to ``datasets.MNIST``. Samples are converted to
    tensors and then normalized.
    """
    # Precomputed mean and standard deviation of the MNIST data; normalizing
    # with them gives the data zero mean and unit standard deviation.
    mnist_mean, mnist_std = (0.1307,), (0.3081,)
    sample_transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Normalize(mnist_mean, mnist_std)]
    )
    return datasets.MNIST(data_dir, train=train, transform=sample_transform)
25 | 
26 | 
def download_dataset(download_directory: str, data_config: Dict[str, Any]) -> str:
    """Fetch the archive at ``data_config["url"]`` and unpack it under ``<download_directory>/MNIST``.

    The archive is stored as ``<download_directory>/MNIST/<last component of
    the URL path>``. If a file with that name already exists the download is
    skipped; the archive is unpacked on every call either way.

    Args:
        download_directory: Directory under which the ``MNIST`` subdirectory is created.
        data_config: Mapping that must contain a ``"url"`` entry for the archive.

    Returns:
        The parent directory of the ``MNIST`` subdirectory.

    Raises:
        KeyError: If ``data_config`` has no ``"url"`` entry.
        IndexError: If the URL path contains no ``/``.
        requests.HTTPError: If the server answers with an error status.
    """
    url = data_config["url"]
    url_path = urllib.parse.urlparse(url).path
    basename = url_path.rsplit("/", 1)[1]

    download_directory = os.path.join(download_directory, "MNIST")
    os.makedirs(download_directory, exist_ok=True)
    filepath = os.path.join(download_directory, basename)
    if not os.path.exists(filepath):
        logging.info("Downloading %s to %s", url, filepath)

        # Stream into a sibling ".part" file and rename it only once the
        # download finished: the existence check above treats any file at
        # `filepath` as a complete archive, so a truncated or error-page
        # download must never be left there.
        partial_path = filepath + ".part"
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        os.replace(partial_path, filepath)

    shutil.unpack_archive(filepath, download_directory)

    return os.path.dirname(download_directory)
47 | 


--------------------------------------------------------------------------------
/model_hub/mmdetection/hydra/README.md:
--------------------------------------------------------------------------------
 1 | # Using model-hub mmdetection with [Hydra](https://hydra.cc/)
 2 | Hydra is a framework for configuring applications that works very well with machine learning experiments.
 3 | You can use Determined's Python SDK with Hydra to:
 4 | * Easily submit experiments with different configurations
 5 | * Perform parameter sweeps
 6 | * Compose configurations
 7 | 
 8 | ## Setup
 9 | You need to install Determined and Hydra in order to try this out.
10 | ```
11 | pip install "hydra-core>=1.1"
12 | pip install determined
13 | ```
14 | 
15 | ## Submitting experiments
16 | Make sure the `DET_MASTER` environment variable is set.  Then you can create experiments by running
17 | ```
18 | python mmdet_experiment.py hyperparameters.config_file=mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py
19 | ```
20 | 
21 | Hydra makes it easy to modify the configuration from the CLI:
22 | ```
23 | python mmdet_experiment.py hyperparameters.config_file=faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py
24 | ```
25 | 
26 | Or try multiple values:
27 | ```
28 | python mmdet_experiment.py --multirun \
29 |     hyperparameters.config_file=faster_rcnn/faster_rcnn_r50_fpn_1x_coco.py,detr/detr_r50_8x2_150e_coco.py
30 | ```
31 | 
32 | Configuration with Hydra is also highly flexible and extensible.
33 | For example, you can run hyperparameter search on the optimizer learning rate by
34 | ```
35 | python mmdet_experiment.py searcher=adaptive +hyperparameters=tune_optimizer hyperparameters.config_file=mask_rcnn/mask_rcnn_r50_fpn_1x_coco.py
36 | ```
37 | You can look at the [config directory](configs) to see how we use some of this functionality.  Feel free to add your own configs as needed to further customize the behavior.
38 | 


--------------------------------------------------------------------------------
/computer_vision/iris_tf_keras/README.md:
--------------------------------------------------------------------------------
 1 | # TensorFlow (tf.keras) Iris Species Categorization Example
 2 | 
 3 | This example shows how to run a CNN on the Iris species dataset using
 4 | Determined's tf.keras API. This example is adapted from this [Iris species 
 5 | categorization medium post](https://medium.com/@nickbortolotti/iris-species-categorization-using-tf-keras-tf-data-and-differences-between-eager-mode-on-and-off-9b4693e0b22).
 6 | 
 7 | ## Files
 8 | * **model_def.py**: The core code for the model. This includes building and compiling the model.
 9 | * **startup-hook.sh**: Additional dependencies that Determined will automatically install into each container for this experiment.
10 | 
11 | ### Configuration Files
12 | * **const.yaml**: Train the model with constant hyperparameter values.
13 | * **distributed.yaml**: Same as `const.yaml`, but trains the model with multiple GPUs (distributed training).
14 | * **adaptive.yaml**: Perform a hyperparameter search using Determined's state-of-the-art adaptive hyperparameter tuning algorithm.
15 | 
16 | ## Data:
17 | The current implementation uses [UCI's Iris Data Set](https://archive.ics.uci.edu/ml/datasets/iris).
18 | 
19 | ## To Run:
20 | If you have not yet installed Determined, installation instructions can be found
21 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
22 | 
23 | Run the following command: `det -m <master host:port> experiment create -f 
24 | const.yaml .`. The other configurations can be run by specifying the appropriate 
25 | configuration file in place of `const.yaml`.
26 | 
27 | ## Results:
28 | Training the model with the hyperparameter settings in `const.yaml` should yield 
29 | a validation accuracy of ~95%.
30 | 


--------------------------------------------------------------------------------
/blog/llm-finetuning-3/dpo.yaml:
--------------------------------------------------------------------------------
 1 | name: gemma-2b dpo
 2 | debug: false
 3 | environment:
 4 |   environment_variables:
 5 |     - NCCL_DEBUG=INFO
 6 |   image: determinedai/genai-train:latest
 7 | resources:
 8 |   slots_per_trial: 2
 9 |   resource_pool: A100
10 |   max_slots: 8
11 | searcher:
12 |   name: grid
13 |   max_length:
14 |     batches: 5000
15 |   metric: eval_accuracy
16 |   smaller_is_better: false
17 | hyperparameters:
18 |   model_name: "google/gemma-2b-it"
19 |   # model_ckpt: "6b6fbaa7-faa9-4449-867b-2939a147a335"
20 |   datasets:
21 |     - "argilla/dpo-mix-7k"
22 |     - "jondurbin/truthy-dpo-v0.1"
23 |   dpo_beta:
24 |     type: categorical
25 |     vals:
26 |       - 0.1
27 |       - 0.05
28 |       - 0.01
29 |   dpo_loss: "sigmoid"
30 |   max_length: 4096
31 |   max_prompt_length: 2048
32 |   max_target_length: 2048
33 |   precompute_ref_log_probs: true
34 |   training_args:
35 |     output_dir: "/tmp/llm_finetuning"
36 |     num_train_epochs: 2
37 |     per_device_train_batch_size: 1
38 |     per_device_eval_batch_size: 1
39 |     bf16: true
40 |     bf16_full_eval: true
41 |     evaluation_strategy: "steps"
42 |     eval_steps: 100
43 |     logging_strategy: "steps"
44 |     logging_steps: 10
45 |     save_strategy: "epoch"
46 |     save_steps: 1
47 |     learning_rate:
48 |       type: categorical
49 |       vals:
50 |         - 1e-7
51 |         - 5e-7
52 |         - 5e-8
53 |     gradient_accumulation_steps: 8
54 |     gradient_checkpointing: true
55 |     deepspeed: "ds_configs/ds_config_stage_2.json"
56 |     warmup_ratio: 0.1
57 |     lr_scheduler_type: "cosine"
58 |     optim: "adamw_torch"
59 | entrypoint: >-
60 |   python -m determined.launch.deepspeed
61 |   python dpo_finetune.py
62 | max_restarts: 0


--------------------------------------------------------------------------------
/computer_vision/detr_coco_pytorch/data_utils.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | from io import BytesIO
 4 | from shutil import unpack_archive
 5 | from tempfile import NamedTemporaryFile
 6 | 
 7 | import requests
 8 | 
 9 | 
def download_file(url, output_dir):
    """Stream ``url`` into ``output_dir`` and return the path of the written file.

    The file is named after the text following the last ``/`` in ``url``.
    An HTTP error status raises via ``raise_for_status`` before the file is
    opened.
    """
    target_name = url.rsplit("/", 1)[-1]
    local_filename = os.path.join(output_dir, target_name)
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, "wb") as sink:
            # Chunks are written as they arrive, 8192 bytes at a time.
            sink.writelines(response.iter_content(chunk_size=8192))
    return local_filename
20 | 
21 | 
async def download_and_extract_url(zipurl, outdir):
    """Download the zip archive at ``zipurl`` into ``outdir`` and extract it there.

    Args:
        zipurl: URL of a zip archive.
        outdir: Directory that receives both the downloaded archive and its
            extracted contents.
    """
    filename = download_file(zipurl, outdir)
    # Unpack straight from the downloaded file. Reading the whole archive into
    # memory and copying it to a temporary file first only doubles disk usage
    # and needs as much RAM as the archive is large.
    unpack_archive(filename, outdir, format="zip")
    print("finished extracting: {}".format(zipurl))
    await asyncio.sleep(1)
30 | 
31 | 
def async_download_url_list(url_list, outdir):
    """Download and extract every URL in ``url_list`` into ``outdir``; block until all are done.

    Args:
        url_list: Iterable of archive URLs.
        outdir: Directory passed to ``download_and_extract_url`` for each URL.
    """

    async def _download_all():
        await asyncio.gather(*(download_and_extract_url(url, outdir) for url in url_list))

    # asyncio.run creates and closes its own event loop, so this works even
    # when the thread has no current loop (e.g. after an earlier asyncio.run),
    # where asyncio.get_event_loop() / ensure_future() would fail.
    asyncio.run(_download_all())
36 | 
37 | 
def download_coco_from_source(data_dir):
    """Download and extract the COCO 2017 train and val image archives into ``data_dir``."""
    coco_image_archives = [
        "http://images.cocodataset.org/zips/train2017.zip",
        "http://images.cocodataset.org/zips/val2017.zip",
    ]
    async_download_url_list(coco_image_archives, data_dir)


# Running this module as a script fetches the archives into /tmp.
if __name__ == "__main__":
    download_coco_from_source("/tmp")
48 | 


--------------------------------------------------------------------------------
/custom_search_method/asha_search_method/remote_search_runner/README.md:
--------------------------------------------------------------------------------
 1 | # Custom SearchMethod with RemoteSearchRunner
 2 | 
 3 | In this example, we use RemoteSearchRunner, which executes a custom SearchMethod as a single trial experiment and
 4 | orchestrates a multi-trial experiment. Both the custom SearchMethod and the multi-trial experiment are executed
 5 | on the Determined cluster.
 6 | 
 7 | For an example of running the custom SearchMethod locally,
 8 | see `examples/custom_search_method/asha_search_method/local_search_runner`.
 9 | 
10 | ## Files
11 | * **run_experiment.py**: The code for running a custom SearchMethod with RemoteSearchRunner.
12 | 
13 | ### Configuration Files
14 | * **searcher.yaml**: Configuration for running custom SearchMethod as an experiment on the Determined cluster.
15 | 
16 | 
17 | ## To Run
18 | If you have not yet installed Determined, installation instructions can be found
19 | under `docs/install-admin.html` or at https://docs.determined.ai/latest/index.html
20 | 
21 | 1. Set the `DET_MASTER` environment variable, which is the network address of the Determined master.
22 | For instance, `export DET_MASTER=<master_host:port>`.
23 | 2. Run the following command in the `asha_search_method` directory to start RemoteSearchRunner on the Determined cluster:
24 | `det experiment create remote_search_runner/searcher.yaml .`.
25 | 
26 | ## Result
27 | RemoteSearchRunner is submitted to the Determined master as a single trial experiment.
28 | While running on the cluster, RemoteSearchRunner executes the custom SearchMethod and starts a multi-trial experiment
29 | for hyperparameter search. Similarly to LocalSearchRunner, RemoteSearchRunner handles the communication between the
30 | custom SearchMethod and the multi-trial experiment.


--------------------------------------------------------------------------------
/computer_vision/deformabledetr_coco_pytorch/data_utils.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import os
 3 | from io import BytesIO
 4 | from shutil import unpack_archive
 5 | from tempfile import NamedTemporaryFile
 6 | 
 7 | import requests
 8 | 
 9 | 
def download_file(url, output_dir):
    """Stream ``url`` into ``output_dir`` and return the path of the written file.

    The file is named after the text following the last ``/`` in ``url``.
    An HTTP error status raises via ``raise_for_status`` before the file is
    opened.
    """
    target_name = url.rsplit("/", 1)[-1]
    local_filename = os.path.join(output_dir, target_name)
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(local_filename, "wb") as sink:
            # Chunks are written as they arrive, 8192 bytes at a time.
            sink.writelines(response.iter_content(chunk_size=8192))
    return local_filename
20 | 
21 | 
async def download_and_extract_url(zipurl, outdir):
    """Download the zip archive at ``zipurl`` into ``outdir`` and extract it there.

    Args:
        zipurl: URL of a zip archive.
        outdir: Directory that receives both the downloaded archive and its
            extracted contents.
    """
    filename = download_file(zipurl, outdir)
    # Unpack straight from the downloaded file. Reading the whole archive into
    # memory and copying it to a temporary file first only doubles disk usage
    # and needs as much RAM as the archive is large.
    unpack_archive(filename, outdir, format="zip")
    print("finished extracting: {}".format(zipurl))
    await asyncio.sleep(1)
30 | 
31 | 
def async_download_url_list(url_list, outdir):
    """Download and extract every URL in ``url_list`` into ``outdir``; block until all are done.

    Args:
        url_list: Iterable of archive URLs.
        outdir: Directory passed to ``download_and_extract_url`` for each URL.
    """

    async def _download_all():
        await asyncio.gather(*(download_and_extract_url(url, outdir) for url in url_list))

    # asyncio.run creates and closes its own event loop, so this works even
    # when the thread has no current loop (e.g. after an earlier asyncio.run),
    # where asyncio.get_event_loop() / ensure_future() would fail.
    asyncio.run(_download_all())
36 | 
37 | 
def download_coco_from_source(data_dir):
    """Download and extract the COCO 2017 train and val image archives into ``data_dir``."""
    coco_image_archives = [
        "http://images.cocodataset.org/zips/train2017.zip",
        "http://images.cocodataset.org/zips/val2017.zip",
    ]
    async_download_url_list(coco_image_archives, data_dir)


# Running this module as a script fetches the archives into /tmp.
if __name__ == "__main__":
    download_coco_from_source("/tmp")
48 | 


--------------------------------------------------------------------------------
/blog/python_sdk_demo/README.md:
--------------------------------------------------------------------------------
 1 | # det-python-sdk-demo
 2 | 
 3 | ## Overview
 4 | 
 5 | This script shows example usage of the Determined Python SDK to run and administer experiments.
 6 | 
 7 | It:
 8 | 1. Archives any existing experiments with the same names as the datasets we'll train on.
 9 | 2. Creates models for each dataset and registers them in the Determined model registry.
10 | 3. Trains a model for each dataset by creating an experiment.
11 | 4. Registers the best checkpoint for each experiment in the Determined model registry.
12 | 
13 | For an in-depth discussion of this script, see the blog post:
14 |     https://www.determined.ai/blog/python-sdk
15 | 
16 | For more information on the Determined Python SDK, see:
17 |     https://docs.determined.ai/latest/reference/python-sdk.html
18 | 
19 | ## Installation / Execution
20 | 
21 | To run this demo:
22 | 
23 | 1. Install dependencies. In addition to the Determined CLI, this demo uses MedMNIST datasets.
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 
28 | 2. Set DET_MASTER environment variable. For example, if you're running this locally:
29 | ```
30 | export DET_MASTER=localhost:8080
31 | ```
32 | 
33 | For more information about configuring the CLI, see [this doc](https://docs.determined.ai/latest/setup-cluster/setup-clients.html#setting-up-clients).
34 | 
35 | 3. Now the demo is ready to be executed. To run experiments:
36 | ```
37 | python determined_sdk_demo.py
38 | ```
39 | 
40 | ## Contributors
41 | 
42 | - [Wesley Turner](https://github.com/wes-turner)
43 | - [Kevin Musgrave](https://github.com/KevinMusgrave)
44 | 
45 | The code in the `mednist_model` directory is based on the [`determined_medmnist_e2e`](https://github.com/ighodgao/determined_medmnist_e2e) repo by [Isha Ghodgaonkar](https://github.com/ighodgao).


--------------------------------------------------------------------------------
/gan/dcgan_tf_keras/export.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Shows an example of how a model trained in Determined can be easily exported and used.
 3 | """
 4 | 
 5 | import argparse
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import tensorflow as tf
 9 | 
10 | from determined.experimental import client
11 | 
12 | 
def generate_and_plot_images(generator: tf.keras.Sequential, noise_dim: int) -> None:
    """Feed 16 random noise vectors of size ``noise_dim`` to ``generator`` and show the outputs.

    The outputs are drawn into a 4x4 grid of grayscale subplots, using
    channel 0 of each output rescaled by ``* 127.5 + 127.5``.
    """
    noise = tf.random.normal([16, noise_dim])
    # `training=False` so that all layers run in inference mode (batchnorm).
    images = generator(noise, training=False)

    plt.figure(figsize=(4, 4))

    num_images = images.shape[0]
    for position in range(1, num_images + 1):
        plt.subplot(4, 4, position)
        pixels = images[position - 1, :, :, 0] * 127.5 + 127.5
        plt.imshow(pixels, cmap="gray")
        plt.axis("off")
    plt.show()
26 | 
27 | 
def export_model(experiment_id: int) -> tf.keras.Model:
    """Load and return the model stored in the top checkpoint of experiment ``experiment_id``."""
    experiment = client.get_experiment(experiment_id)
    best_checkpoint = experiment.top_checkpoint()
    return best_checkpoint.load()
32 | 
33 | 
def main():
    """Parse the CLI arguments, log in to the master, load the model, and plot generated images."""
    cli = argparse.ArgumentParser(description="DCGan Model Export")
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--experiment-id", dict(type=int, required=True, help="Experiment ID to export.")),
        ("--master-url", dict(type=str, default="", help="URL of the Determined master.")),
        (
            "--noise-dim",
            dict(type=int, default=128, help="Needs to match noise dim during training."),
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    args = cli.parse_args()

    client.login(args.master_url)
    trained_model = export_model(args.experiment_id)
    generate_and_plot_images(trained_model.generator, args.noise_dim)


if __name__ == "__main__":
    main()
53 | 


--------------------------------------------------------------------------------
/deepspeed/pipeline_parallelism/alexnet.py:
--------------------------------------------------------------------------------
 1 | # Implementation of AlexNet for illustrative purposes. The train.py driver
 2 | # can import AlexNet from here or directly from torchvision.
 3 | #
 4 | # Taken from torchvision.models.alexnet:
 5 | # https://pytorch.org/docs/1.6.0/_modules/torchvision/models/alexnet.html#alexnet
 6 | 
 7 | 
 8 | import torch
 9 | import torch.nn as nn
10 | 
11 | 
class AlexNet(nn.Module):
    """AlexNet image classifier built from three stages.

    ``features`` is the convolutional stack, ``avgpool`` resizes its output
    to a fixed 6x6 spatial grid, and ``classifier`` is the fully connected
    head. The attribute names and the order of layers inside each
    ``nn.Sequential`` are kept stable so state-dict keys such as
    ``features.0.weight`` and ``classifier.6.bias`` do not change.

    Args:
        num_classes: Size of the final linear layer's output.
        dropout: Probability used by both ``nn.Dropout`` layers in the
            classifier. Defaults to 0.5, the ``nn.Dropout`` default that the
            previously argument-less ``nn.Dropout()`` calls relied on.
    """

    def __init__(self, num_classes: int = 1000, dropout: float = 0.5) -> None:
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Adaptive pooling fixes the spatial size at 6x6, so the first linear
        # layer below always receives 256 * 6 * 6 features.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run a batch of 3-channel images through the network.

        Args:
            x: Batched input with 3 channels in dimension 1.

        Returns:
            Tensor with one row per batch element and ``num_classes`` columns.
        """
        x = self.features(x)
        x = self.avgpool(x)
        # Keep the batch dimension and flatten everything after it.
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
47 | 


--------------------------------------------------------------------------------