├── .DS_Store
├── .gitignore
├── 01_deepLearning
│   ├── .DS_Store
│   ├── 01_introduction_mnist.ipynb
│   ├── 02_conv_networks.ipynb
│   ├── README.md
│   └── images
│       ├── LinearModel_1.png
│       ├── MaxpoolSample2.png
│       ├── MnistExamples.png
│       ├── Padding.png
│       ├── ResNet.png
│       ├── U-Nets.png
│       ├── ViT.gif
│       ├── _placeholder
│       ├── activation.jpeg
│       ├── activation_functions.png
│       ├── bias_vs_variance.png
│       ├── conv.png
│       ├── convNN_BlockDiagram.jpeg
│       ├── conv_layer.png
│       ├── convnets-feature-maps.png
│       ├── deep_nn.png
│       ├── dropout.png
│       ├── learning-rate-gradient-descent.jpeg
│       ├── lr.jpeg
│       ├── mnist_task.png
│       ├── multiple_channels.png
│       ├── nn1.pdf
│       ├── nn1.png
│       ├── nonconvex.png
│       ├── one_more_layer.png
│       ├── optimization_types.png
│       ├── shallow_nn.png
│       ├── test_data_rule.png
│       ├── three_layer_network.png
│       └── tiny_network.png
├── 02_dataPipelines
│   ├── 00_tensorflowDatasetAPI
│   │   ├── README.md
│   │   ├── ilsvrc.json
│   │   ├── ilsvrc_dataset.py
│   │   ├── logdir
│   │   │   └── .gitignore
│   │   └── submit_polaris.sh
│   ├── 01_pytorchDatasetAPI
│   │   ├── README.md
│   │   ├── imagenet_parallel.py
│   │   ├── imagenet_serial.py
│   │   ├── logdir
│   │   │   └── .gitignore
│   │   └── submit_polaris.sh
│   ├── README.md
│   └── images
│       ├── ilsvrc_64threads.png
│       ├── ilsvrc_64threads_zoom.png
│       ├── ilsvrc_parallel.png
│       ├── ilsvrc_serial.png
│       ├── ilsvrc_serial_zoom.png
│       ├── n01667778_12001.JPEG
│       ├── n02094114_1205.JPEG
│       └── pytorch_threading.png
├── 03_introlangmodels
│   ├── 03_languagemodels.ipynb
│   ├── 03_languagemodels_colab.ipynb
│   ├── README.md
│   ├── dataset
│   │   ├── cached_lm_GPT2TokenizerFast_128_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_input.txt.lock
│   │   ├── cached_lm_GPT2TokenizerFast_128_test_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_test_input.txt.lock
│   │   ├── cached_lm_GPT2TokenizerFast_128_train_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_train_input.txt.lock
│   │   ├── input.txt
│   │   ├── test_input.txt
│   │   └── train_input.txt
│   ├── images
│   │   ├── Attention_Vis.png
│   │   ├── BERT_Explanation.webp
│   │   ├── BERT_input_sent.webp
│   │   ├── Byte_Pair_enc.webp
│   │   ├── Graphformer.png
│   │   ├── LLM_Architectures.webp
│   │   ├── LLM_Blackbox.png
│   │   ├── LLM_Theoret_Apps.pptx
│   │   ├── Lorem.png
│   │   ├── Postitional_Embedding.webp
│   │   ├── Protein-Structure-06.png
│   │   ├── RNA-codons.svg.png
│   │   ├── The_transformer_encoder_decoder_stack.png
│   │   ├── The_transformer_encoders_decoders.png
│   │   ├── Transformer_Arch.png
│   │   ├── Transformer_Enc_Dec_Blocks.png
│   │   ├── Transformer_decoder.png
│   │   ├── WordPieceTok.webp
│   │   ├── attention_is_all_you_need.png
│   │   ├── chars-tokenization.png
│   │   ├── decoder_only_block.png
│   │   ├── en_chapter1_transformers_chrono.svg
│   │   ├── encode_decode.png
│   │   ├── encoder_with_tensors_2.png
│   │   ├── genslm.png
│   │   ├── gpt-2-layers-2.png
│   │   ├── gpt2-output.png
│   │   ├── gpt2-self-attention-example-2.png
│   │   ├── one-hot-vocabulary-example.png
│   │   ├── output_target_probability_distributions.png
│   │   ├── positional_encoding.png
│   │   ├── recurrent_nn.png
│   │   ├── rnn.png
│   │   ├── self-attention-and-masked-self-attention.png
│   │   ├── self-attention-example-folders-3.png
│   │   ├── self-attention-output.png
│   │   ├── text-processing---machines-vs-humans.png
│   │   ├── the_transformer_3.png
│   │   ├── tokenize.png
│   │   ├── tokenize_words.png
│   │   ├── transformer-decoder-intro.png
│   │   ├── transformer_attention_heads_z.png
│   │   ├── transformer_decoder_output_softmax (1).png
│   │   ├── transformer_decoder_output_softmax.png
│   │   ├── transformer_decoding_1.gif
│   │   ├── transformer_decoding_2.gif
│   │   ├── transformer_logits_output_and_label.png
│   │   ├── transformer_multi-headed_self-attention-recap.png
│   │   ├── transformer_positional_encoding_vectors.png
│   │   ├── transformer_self-attention_visualization.png
│   │   ├── transformer_self-attention_visualization_3.png
│   │   ├── vision-transformer-vit.png
│   │   ├── viz-bert-voc-verbs.png
│   │   └── wordembedding.png
│   └── vocab.txt
├── 03_profileLearning
│   ├── .gitignore
│   ├── README.md
│   ├── line_profiler
│   │   ├── README.md
│   │   ├── train_MNIST.py
│   │   └── train_MNIST_iofix.py
│   ├── reduced_precision
│   │   ├── README.md
│   │   ├── images
│   │   │   ├── kernel-stats.png
│   │   │   ├── profiler_overview.png
│   │   │   ├── tf-stats.png
│   │   │   ├── top10-ops.png
│   │   │   ├── trace-viewer.png
│   │   │   ├── trace-zoomed-float32.png
│   │   │   └── trace-zoomed.png
│   │   └── train_MNIST_tf_function_XLA_mixed.py
│   ├── tf_function
│   │   ├── README.md
│   │   ├── train_MNIST_tf_function.py
│   │   ├── train_MNIST_tf_function_XLA.py
│   │   └── xla_bug_generated
│   │       ├── 1628017669967300.module_0000.after_optimizations-buffer-assignment.txt
│   │       ├── 1628017669967300.module_0000.after_optimizations.txt
│   │       ├── 1628017669967300.module_0000.before_optimizations.txt
│   │       ├── before_increase_dynamism_for_auto_jit_pass.pbtxt
│   │       ├── before_increase_dynamism_for_auto_jit_pass_1.pbtxt
│   │       ├── before_mark_for_compilation.pbtxt
│   │       ├── mark_for_compilation.pbtxt
│   │       ├── mark_for_compilation_annotated.pbtxt
│   │       ├── module_0000.ir-no-opt-noconst.ll
│   │       ├── module_0000.ir-no-opt.ll
│   │       ├── module_0000.ir-with-opt-noconst.ll
│   │       ├── module_0000.ir-with-opt.ll
│   │       ├── module_0000.ptx
│   │       └── module_0000.thunk_schedule
│   ├── tf_profiler
│   │   ├── README.md
│   │   ├── images
│   │   │   ├── kernel-stats.png
│   │   │   ├── profiler_overview.png
│   │   │   ├── tf-stats.png
│   │   │   ├── top10_ops.png
│   │   │   ├── trace-viewer-zoom.png
│   │   │   └── trace-viewer.png
│   │   └── train_MNIST_tf_function_XLA.py
│   ├── train_MNIST.py
│   └── train_MNIST_optimized.py
├── 04_distributedLearning
│   ├── ATPESC_2024_AIMLTrack_DDL_Zheng.pdf
│   ├── DDP
│   │   └── 04_pytorch_cnn_ddp.py
│   ├── DeepSpeed
│   │   ├── 04_pytorch_cnn_ds.py
│   │   └── ds_config.json
│   ├── Horovod
│   │   ├── 04_keras_cnn_concise.py
│   │   ├── 04_keras_cnn_concise_hvd.py
│   │   ├── 04_keras_cnn_verbose.py
│   │   ├── 04_keras_cnn_verbose_hvd.py
│   │   ├── 04_pytorch_cnn.py
│   │   ├── 04_pytorch_cnn_hvd.py
│   │   ├── HorovodTimeline
│   │   │   ├── cpu.json
│   │   │   └── gpu.json
│   │   └── mpitrace
│   │       ├── cpu
│   │       │   ├── #mpi_profile.2500161.0#
│   │       │   ├── mpi_profile.2500161.0
│   │       │   ├── mpi_profile.2500161.1
│   │       │   ├── mpi_profile.2500161.4
│   │       │   └── mpi_profile.2500161.5
│   │       └── gpu
│   │           ├── mpi_profile.2497205.0
│   │           ├── mpi_profile.2497205.2
│   │           ├── mpi_profile.2497205.5
│   │           └── mpi_profile.2497205.6
│   ├── README.md
│   ├── figures
│   │   ├── Horovod.png
│   │   ├── cpu_horovodtimeline.png
│   │   ├── distributed.png
│   │   ├── gpu_horovodtimeline.png
│   │   └── resnet50.png
│   ├── results
│   │   ├── concise_1.out
│   │   ├── concise_2.out
│   │   ├── concise_4.out
│   │   └── concise_8.out
│   └── submissions
│       ├── qsub_polaris.sc
│       └── qsub_thetagpu.sc
├── 05_aiTestbed
│   ├── .DS_Store
│   ├── Cerebras
│   │   ├── Cerebras_Wafer-Scale_Cluster_login_diagram.png
│   │   ├── README.md
│   │   └── gpt-j.md
│   ├── Graphcore
│   │   ├── README.md
│   │   ├── gpt2.md
│   │   └── graphcore_login.png
│   ├── Groq
│   │   ├── README.md
│   │   ├── groqrack_system_diagram.png
│   │   └── minilm.md
│   ├── README.md
│   ├── SambaNova
│   │   ├── README.md
│   │   ├── gpt15b.md
│   │   └── sambanova_login.jpg
│   └── Slides
│       └── .DS_Store
├── README.md
├── README_imgs
│   ├── .DS_Store
│   ├── colab_clean_page.png
│   ├── colab_open_github.png
│   ├── colab_start_page.png
│   └── collab_start_page1.png
├── extra_statisticalLearning
│   ├── README.md
│   ├── assets
│   │   ├── atpesc-k-means-step1.svg
│   │   ├── atpesc-k-means-step2.svg
│   │   ├── atpesc-k-means-step3.svg
│   │   ├── atpesc-k-means-step4.svg
│   │   ├── learning-rate.svg
│   │   └── pngs
│   │       ├── atpesc-k-means-step1.png
│   │       ├── atpesc-k-means-step2.png
│   │       ├── atpesc-k-means-step3.png
│   │       ├── atpesc-k-means-step4.png
│   │       └── learning-rate.png
│   ├── data
│   │   ├── realestate_train.csv
│   │   └── rna.csv
│   ├── images
│   │   ├── image.png
│   │   ├── kmeans.png
│   │   ├── mse.pdf
│   │   ├── mse.png
│   │   ├── mse.svg
│   │   ├── sgd.pdf
│   │   ├── sgd.png
│   │   ├── sgd.svg
│   │   └── sgd_example.png
│   ├── pyproject.toml
│   ├── setup.cfg
│   └── src
│       └── atpesc
│           ├── __init__.py
│           ├── common.py
│           ├── notebooks
│           │   └── statistical_learning.ipynb
│           └── utils
│               └── plots.py
└── introduction.ipynb

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints
--------------------------------------------------------------------------------
/01_deepLearning/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/.DS_Store
--------------------------------------------------------------------------------
/01_deepLearning/README.md:
--------------------------------------------------------------------------------
# Introduction to deep learning
ATPESC 2024

Author: Bethany Lusch (blusch@anl.gov), adapting materials from Marieme Ngom, Prasanna Balaprakash, Taylor Childers, Corey Adams, and Kyle Felker.

This is a hands-on introduction to deep learning, a machine learning technique that tends to outperform other techniques when dealing with large amounts of data.

This is a quick overview, but the goals are:
- to introduce the fundamental concepts of deep learning through hands-on activities
- to give you the necessary background for the more advanced topics on scaling and performance that we will teach later today.

Some rough definitions:

- Artificial intelligence (AI) is a set of approaches to solving complex problems by imitating the brain's ability to learn.
- Machine learning (ML) is the field of study that gives computers the ability to learn without being explicitly programmed (i.e., learning patterns instead of writing down rules). Arguably, machine learning is now a subfield of AI.

Ready for more?
- Here are some of our longer training materials: https://www.alcf.anl.gov/alcf-ai-science-training-series
- Here's a thorough hands-on textbook: [book](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) with [notebooks](https://github.com/ageron/handson-ml2).

We will work on a classification problem involving the [MNIST dataset](http://yann.lecun.com/exdb/mnist/), which contains thousands of examples of handwritten numbers, with each digit labeled 0-9. The model learns to "classify" each image as one of the ten classes.
![MNIST Task](images/mnist_task.png)

We are going to run Jupyter notebooks. You can run them in Google Colab (see instructions [here](../README.md)). If that's a problem, you can also use your own computer or ALCF's [JupyterHub](https://docs.alcf.anl.gov/services/jupyter-hub/).
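To make the classification task concrete, here is a minimal Keras sketch of an MNIST classifier. This is a simplified stand-in, not the notebooks' exact code, and the layer sizes and epoch count are illustrative choices:

```python
import tensorflow as tf

# Load MNIST: 60k training and 10k test images of handwritten digits (28x28 grayscale)
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # scale pixel values to [0, 1]

# A small fully-connected classifier: 10 outputs, one probability per digit class
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.fit(x_train, y_train, epochs=3, validation_data=(x_test, y_test))
```

Even this tiny network reaches roughly 97-98% test accuracy; the notebooks build up the same pieces (layers, activations, loss, optimizer) step by step and then move on to convolutional architectures.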
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /01_deepLearning/images/LinearModel_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/LinearModel_1.png -------------------------------------------------------------------------------- /01_deepLearning/images/MaxpoolSample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/MaxpoolSample2.png -------------------------------------------------------------------------------- /01_deepLearning/images/MnistExamples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/MnistExamples.png -------------------------------------------------------------------------------- /01_deepLearning/images/Padding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/Padding.png -------------------------------------------------------------------------------- /01_deepLearning/images/ResNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/ResNet.png -------------------------------------------------------------------------------- /01_deepLearning/images/U-Nets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/U-Nets.png -------------------------------------------------------------------------------- /01_deepLearning/images/ViT.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/ViT.gif -------------------------------------------------------------------------------- /01_deepLearning/images/_placeholder: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_deepLearning/images/activation.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/activation.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/activation_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/activation_functions.png -------------------------------------------------------------------------------- /01_deepLearning/images/bias_vs_variance.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/bias_vs_variance.png -------------------------------------------------------------------------------- /01_deepLearning/images/conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/conv.png -------------------------------------------------------------------------------- /01_deepLearning/images/convNN_BlockDiagram.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/convNN_BlockDiagram.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/conv_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/conv_layer.png -------------------------------------------------------------------------------- /01_deepLearning/images/convnets-feature-maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/convnets-feature-maps.png -------------------------------------------------------------------------------- /01_deepLearning/images/deep_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/deep_nn.png -------------------------------------------------------------------------------- /01_deepLearning/images/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/dropout.png -------------------------------------------------------------------------------- /01_deepLearning/images/learning-rate-gradient-descent.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/learning-rate-gradient-descent.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/lr.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/lr.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/mnist_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/mnist_task.png -------------------------------------------------------------------------------- 
/01_deepLearning/images/multiple_channels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/multiple_channels.png -------------------------------------------------------------------------------- /01_deepLearning/images/nn1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nn1.pdf -------------------------------------------------------------------------------- /01_deepLearning/images/nn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nn1.png -------------------------------------------------------------------------------- /01_deepLearning/images/nonconvex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nonconvex.png -------------------------------------------------------------------------------- /01_deepLearning/images/one_more_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/one_more_layer.png -------------------------------------------------------------------------------- /01_deepLearning/images/optimization_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/optimization_types.png -------------------------------------------------------------------------------- /01_deepLearning/images/shallow_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/shallow_nn.png -------------------------------------------------------------------------------- /01_deepLearning/images/test_data_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/test_data_rule.png -------------------------------------------------------------------------------- /01_deepLearning/images/three_layer_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/three_layer_network.png -------------------------------------------------------------------------------- /01_deepLearning/images/tiny_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/tiny_network.png -------------------------------------------------------------------------------- 
/02_dataPipelines/00_tensorflowDatasetAPI/README.md:
--------------------------------------------------------------------------------
# Tensorflow Example Data Pipeline
by J. Taylor Childers (jchilders@anl.gov)

This example implements a data pipeline for the ImageNet dataset described in the [README](../README.md) one level up.

An example submission script for Polaris is provided.

Submit to Polaris using:
```bash
qsub -A <project> -q <queue> submit_polaris.sh
```

All log files go into the `logdir` folder.


# Profiler View

You can view the processes and how they occupy the compute resources in Tensorflow using tensorboard.

You can log in to Polaris using:
```bash
# our proxy port, must be > 1024
export PORT=10001
# login to Polaris with port forwarding
ssh -D $PORT user@polaris.alcf.anl.gov
# load any conda environment that has a compatible tensorboard installation
module use /soft/modulefiles
module load conda
conda activate
cd <path-to-this-example>
# start tensorboard (load_fast==false is a recent setting that seems to be needed until Tensorflow works out the bugs)
tensorboard --bind_all --logdir logdir
```
Note the port number that `tensorboard` reports when it starts up.

Only one user can use a specific port, so if you get an error, choose another port number larger than `1024`.

Once you have that set up, set the SOCKS5 proxy of your favorite browser to host `localhost` and port `$PORT` (where `$PORT` is the value you used in the above script, like `10001`). Then, in the browser URL bar, enter the login node on which you started `tensorboard`; for instance, you can type in `localhost:6006`. Here `6006` is the port that `tensorboard` uses by default to start up its web service, but it may vary if that port is already in use, so note the port reported by `tensorboard` when it starts.
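The example script drives profiling itself; if you want to produce similar traces from your own Keras training loop, one common approach (a minimal sketch, not part of this repo's scripts; the toy model and `profile_batch` range are assumptions) is the TensorBoard callback:

```python
import tensorflow as tf

# toy model and data, just to demonstrate trace generation
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.compile(optimizer="sgd", loss="mse")
x = tf.random.normal((1024, 8))
y = tf.random.normal((1024, 10))

# profile batches 10-20 and write TensorBoard logs (including profiler traces) to ./logdir
tb = tf.keras.callbacks.TensorBoard(log_dir="logdir", profile_batch=(10, 20))
model.fit(x, y, batch_size=32, epochs=1, callbacks=[tb])
```

Pointing `tensorboard --logdir logdir` at the resulting directory, as above, then exposes the Profile tab with the trace viewer.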
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/ilsvrc.json:
--------------------------------------------------------------------------------
{
    "data": {
        "handler": "ilsvrc_dataset",
        "batch_size": 128,
        "train_filelist": "/lus/eagle/projects/datasets/ImageNet/ILSVRC/ilsvrc_train_filelist.txt",
        "test_filelist": "/lus/eagle/projects/datasets/ImageNet/ILSVRC/ilsvrc_val_filelist.txt",
        "shuffle_buffer": 200000,
        "reshuffle_each_iteration": true,
        "num_parallel_readers": 8,
        "prefetch_buffer_size": 1,
        "crop_image_size": [256,256],
        "num_classes": 1000,
        "num_channels": 3
    }
}
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/logdir/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything in this directory
*
# Except this file
!.gitignore
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/submit_polaris.sh:
--------------------------------------------------------------------------------
#!/bin/bash -l
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -l filesystems=eagle:home_fs
#PBS -q debug
#PBS -o logdir/
#PBS -e logdir/

cd $PBS_O_WORKDIR

echo [$SECONDS] setup conda environment
module use /soft/modulefiles
module load conda
conda activate

echo [$SECONDS] python = $(which python)
echo [$SECONDS] python version = $(python --version)

echo [$SECONDS] setup local env vars
NODES=`cat $PBS_NODEFILE | wc -l`
GPUS_PER_NODE=4
RANKS=$((NODES * GPUS_PER_NODE))
echo NODES=$NODES GPUS_PER_NODE=$GPUS_PER_NODE RANKS=$RANKS

export OMP_NUM_THREADS=1
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

export OMP_NUM_THREADS=16
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

export OMP_NUM_THREADS=64
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

echo [$SECONDS] done
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/imagenet_serial.py:
--------------------------------------------------------------------------------
# This example builds a serial data pipeline to use as an example
import os
import time
import argparse
import datetime
total_start = time.time()
from PIL import Image
import numpy as np
import torch
from torchvision import transforms, models


# simple class to calculate a running mean and standard deviation
class MeanCalc:
    def __init__(self):
        self.sum = 0
        self.sum2 = 0
        self.n = 0

    def add(self, x):
        self.sum += x
        self.sum2 += x * x
        self.n += 1

    def mean(self):
        return self.sum / self.n

    def stddev(self):
        return np.sqrt(self.sum2 / self.n - self.mean()*self.mean())

    def __str__(self):
        return f'mean: {self.mean():.2f}, stddev: {self.stddev():.2f}'

# dataset handler for input files
class ImageNetDataset:
    def __init__(self, base_dir, file_list_path, id_to_index, transform=None):
        self.base_dir = base_dir
        self.transform = transform
        self.id_to_index = id_to_index
        with open(file_list_path, 'r') as file:
            self.image_paths = [line.strip() for line in file]

    def __len__(self):
        return len(self.image_paths)

    def load_image(self, idx):
        img_path = self.image_paths[idx]
        img = Image.open(img_path).convert('RGB')
        unique_id = img_path.split('/')[-2]
        target = self.id_to_index[unique_id]
        if self.transform:
            img = self.transform(img)
        return img, target

# the filenames contain a unique id for each image that corresponds to the object ID
# create a hash table for labels from string to int
def build_id_to_index_mapping(file_list_path):
    unique_ids = set()
    with open(file_list_path, 'r') as file:
        for line in file:
            unique_id = line.strip().split('/')[-2]
            unique_ids.add(unique_id)
    return {unique_id: idx for idx, unique_id in enumerate(sorted(unique_ids))}

# transform to resize and convert to tensor
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

def main():

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-b','--nbatch', type=int, default=64)
    parser.add_argument('-s','--nsteps', type=int, default=20)
    parser.add_argument('--profile', action='store_true', default=False)
    parser.add_argument('--base-dir', type=str, default='/lus/eagle/projects/datasets/ImageNet/ILSVRC')
    parser.add_argument('--file-list-path', type=str, default='ilsvrc_train_filelist.txt')
    parser.add_argument('--status-print-interval', type=int, default=5)
    args = parser.parse_args()

    # setup dataset
    base_dir = args.base_dir
    file_list_path = os.path.join(base_dir, args.file_list_path)
    id_to_index = build_id_to_index_mapping(file_list_path)
    dataset = ImageNetDataset(base_dir, file_list_path, id_to_index, transform=transform)

    # run settings
    batch_size = args.nbatch
    print(f'Batch size: {batch_size}')
    total_steps = args.nsteps
    status_print_interval = args.status_print_interval
    profile = args.profile

    # set device to gpu if available
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create a model to use as an example
    model = models.resnet50(weights='IMAGENET1K_V1')
    model.to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    model.train()

    # set profile path to be ./logdir/ + date-time
    log_path = os.path.join('./logdir', datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))

    # create pytorch profiler that outputs TensorBoard logs
    prof = torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=20, warmup=1, active=20, repeat=1),
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        # on_trace_ready=torch.profiler.tensorboard_trace_handler(log_path),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    )

    # start the profiler
    if profile: prof.start()

    step_time = time.time()
    step = 0
    image_rate = MeanCalc()

    # training loop
    while step < total_steps:
        images, targets = [], []
        # build an input batch serially
        for _ in range(batch_size):
            if step * batch_size + _ >= len(dataset):
                break
            img, target = dataset.load_image(step * batch_size + _)
            images.append(img)
            targets.append(target)
        if len(images) == 0:
            break
        # convert to pytorch tensors
        images = torch.stack(images).to(device)
        targets = torch.tensor(targets).to(device)
        step += 1

        # pass the batch through the model
        outputs = model(images)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if profile: prof.step()

        # print status
        if step % status_print_interval == 0:
            step_img_rate = status_print_interval * batch_size / (time.time() - step_time)
            print(f'step: {step}; step_img_rate: {step_img_rate:.2f}')
            if step > 5:
                image_rate.add(step_img_rate)
            step_time = time.time()

    if profile: prof.stop()
    if profile: prof.export_chrome_trace("imagenet_serial.json")
    print(f'Average image rate: {str(image_rate)}')

    print(f'All Done; total runtime: {time.time() - total_start:.2f}')

if __name__ == '__main__':
    main()
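The serial script above is deliberately the slow baseline: every image is decoded on the main process, one at a time, while the GPU waits. The repository's `imagenet_parallel.py` (not included in this dump) moves that work into worker processes; the following is an illustrative sketch of the same idea with `torch.utils.data.DataLoader`, using a toy dataset and assumed parameter values rather than the repo's actual code:

```python
import torch
from torch.utils.data import DataLoader, Dataset

# illustrative stand-in dataset; the real example wraps ImageNet JPEGs
class RandomImages(Dataset):
    def __len__(self):
        return 10000
    def __getitem__(self, idx):
        return torch.randn(3, 256, 256), idx % 1000

# num_workers forks worker processes that load and transform samples
# concurrently with GPU compute; each worker stages prefetch_factor batches
loader = DataLoader(RandomImages(), batch_size=64, num_workers=4,
                    prefetch_factor=2, pin_memory=True)

for images, targets in loader:
    pass  # the training step (forward/backward/optimizer) would go here
```

Sweeping `num_workers` (as the submission script below does with its `-w` flag) is the usual way to find how many loader processes a node's CPUs can profitably support.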
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/logdir/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything in this directory
*
# Except this file
!.gitignore
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/submit_polaris.sh:
--------------------------------------------------------------------------------
#!/bin/bash -l
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -l filesystems=eagle:home_fs
#PBS -q debug
#PBS -o logdir/
#PBS -e logdir/

cd $PBS_O_WORKDIR

echo [$SECONDS] setup conda environment
module use /soft/modulefiles
module load conda
conda activate

echo [$SECONDS] python = $(which python)
echo [$SECONDS] python version = $(python --version)

echo [$SECONDS] setup local env vars
NODES=`cat $PBS_NODEFILE | wc -l`
GPUS_PER_NODE=4
RANKS=$((NODES * GPUS_PER_NODE))
echo NODES=$NODES GPUS_PER_NODE=$GPUS_PER_NODE RANKS=$RANKS

# for PyTorch DDP setup
export MASTER_ADDR="localhost"
export MASTER_PORT=12399
echo [$SECONDS] MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT

BATCH_SIZE=64
NSTEPS=100
PROFILE=--profile
# PROFILE=
echo [$SECONDS] using batch size $BATCH_SIZE and $NSTEPS steps

echo [$SECONDS] run serial example
python imagenet_serial.py -b $BATCH_SIZE -s $NSTEPS $PROFILE


NWORKERS=1
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=2
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=3
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=4
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

echo [$SECONDS] done
--------------------------------------------------------------------------------
/02_dataPipelines/README.md:
--------------------------------------------------------------------------------
# Building a CPU-side data pipeline

Author: J. Taylor Childers (jchilders@anl.gov)

## Learning Goals:
- Use the CPUs of a system to build data batches in parallel while ML calculations are performed on the GPU
- Use parallel processes on the CPU to speed up the data pipeline
- Do all this using the frameworks' (PyTorch, Tensorflow) data APIs

New AI systems largely depend on CPU-GPU hybrid architectures. This makes efficient use of CPU-side resources important in order to feed sufficient data to the GPU algorithms. Ideally, the CPU processes data and builds training batches, while the GPU performs the compute-intensive forward and backward gradient calculations.

Here are examples of building a data pipeline for both Tensorflow and PyTorch. Tensorflow's data pipeline API is a bit more advanced than PyTorch's, so we'll focus on that one, though we also include a PyTorch example.

# ImageNet Dataset

This example uses the ImageNet dataset to build training batches.

![Turtle](images/n01667778_12001.JPEG) ![Dog](images/n02094114_1205.JPEG)

This dataset includes JPEG images and an XML annotation for each file that defines a bounding box for each class. Building a training batch requires pre-processing the images and annotations. In our example, we have created text files that list all the files in the training set and validation set. For each text file, we need to use the input JPEG files and build tensors that include multiple images per training batch.

# Tensorflow Dataset example

Tensorflow has some very nice tools to help us build the pipeline. You'll find the [example here](00_tensorflowDatasetAPI/ilsvrc_dataset.py).

## Build from file list
We'll start in the function `build_dataset_from_filelist`. A consolidated sketch of all the steps below appears at the end of this README.

1. Open the filelist
```python
# loading full filelist
filelist = []
with open(filelist_filename) as file:
    for line in file:
        filelist.append(line.strip())
```
2. Parse the list of files into a TF Tensor
```python
filelist = tf.data.Dataset.from_tensor_slices(filelist)
```
3. If we are using Horovod for MPI parallelism, we want to "shard" the data across nodes so each node processes unique data
```python
filelist = filelist.shard(config['hvd'].size(), config['hvd'].rank())
```
4. Shuffle our filelist at each epoch barrier
```python
filelist = filelist.shuffle(dc['shuffle_buffer'], reshuffle_each_iteration=dc['reshuffle_each_iteration'])
```
5. Run a custom function on the filelist, which effectively opens the JPEG file, loads the data into a TF Tensor and extracts the class labels. If there are multiple objects in the image, this function will return more than one image using the bounding boxes. `num_parallel_calls` allows this function to run in parallel, so many JPEG files can be read into memory and processed in parallel threads.
```python
ds = filelist.map(load_image_label_bb,
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
```
6. Since the previous map function may return one or more images, we need to unbatch the output before we batch it into our fixed batch size
```python
ds = ds.apply(tf.data.Dataset.unbatch)
ds = ds.batch(dc['batch_size'])
```
7. Tell the dataset it can prepare the next batch(es) prior to them being requested
```python
ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
```

Done.

We can now iterate over this dataset in a loop:
```python
for inputs, labels in ds:
    prediction = model(inputs)
    loss = loss_func(prediction, labels)
    # ...
```

## Parallel Processing on Polaris

The example `00_tensorflowDatasetAPI/ilsvrc_dataset.py` can be run via
```bash
cd 00_tensorflowDatasetAPI
qsub -A <project> -q debug submit_polaris.sh
```

This script will run the example 3 times: with 1 thread (no parallelism), 16 threads, and 64 threads per MPI process. The reported `imgs/sec` throughput will be lowest for serial processing and highest for 64 threads per MPI process. You can see in these screenshots from the [Tensorflow Profiler](https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras) how the processes are being utilized.

This profile shows a single process handling all the data pipeline work. All data pipeline calls are made serially when they could be done in parallel, and it takes over 3 seconds to prepare a batch of images.
![serial](images/ilsvrc_serial.png)

In the case of 64 threads per MPI process, batch processing time is down to 0.08 seconds. The profiler shows we are running with our 64 parallel processes, all of which are opening JPEGs, processing them into tensors, extracting truth information, and so on. One can see the `ReadFile` operation taking place in parallel; it opens the JPEG and reads the data into memory. This operation is the most time-consuming in this pipeline, and by parallelizing it we have improved our throughput.
![parallel](images/ilsvrc_64threads_zoom.png)
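Putting steps 1-7 together, here is a minimal self-contained sketch of the same pattern. It uses a toy `decode_file` in place of the example's `load_image_label_bb`, omits the Horovod sharding, and the sizes are illustrative, so treat it as a template rather than the repo's actual pipeline:

```python
import tensorflow as tf

filelist = ["a.jpg", "b.jpg", "c.jpg"]  # stand-in for the real file list

def decode_file(path):
    # toy stand-in: the real load_image_label_bb decodes the JPEG and
    # returns one image/label pair per annotated bounding box
    images = tf.random.uniform((2, 256, 256, 3))        # pretend 2 crops per file
    labels = tf.constant([0, 1], dtype=tf.int64)
    return images, labels

ds = tf.data.Dataset.from_tensor_slices(filelist)
ds = ds.shuffle(len(filelist), reshuffle_each_iteration=True)
ds = ds.map(decode_file, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.unbatch()               # flatten the variable-length map outputs
ds = ds.batch(2)                # re-batch to a fixed batch size
ds = ds.prefetch(tf.data.AUTOTUNE)

for images, labels in ds:
    print(images.shape, labels.shape)
```

The key performance levers are the `num_parallel_calls` on the map (parallel JPEG decoding) and the `prefetch` at the end (overlapping batch preparation with GPU compute), which is exactly what the 1/16/64-thread comparison above measures.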
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_64threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_64threads.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_64threads_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_64threads_zoom.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_parallel.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_serial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_serial.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_serial_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_serial_zoom.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/n01667778_12001.JPEG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/n01667778_12001.JPEG
--------------------------------------------------------------------------------
/02_dataPipelines/images/n02094114_1205.JPEG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/n02094114_1205.JPEG
--------------------------------------------------------------------------------
/02_dataPipelines/images/pytorch_threading.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/pytorch_threading.png
--------------------------------------------------------------------------------
/03_introlangmodels/README.md:
--------------------------------------------------------------------------------
# Introduction to Language models

Author: Archit Vasan (avasan@anl.gov), including and adapting materials and discussions over time by Varuni Sastri, Carlo Graziani, Taylor Childers, Venkat Vishwanath, Jay Alammar and Kevin Gimpel.

This tutorial introduces sequential data modeling, tokenization methods, and embeddings, and attempts to demystify aspects of the Transformer model architecture.

We will refer to this notebook:

[Introduction to language models](https://github.com/argonne-lcf/ATPESC_MachineLearning/blob/master/03_introlangmodels/03_languagemodels.ipynb)

The discussion will include:
* tokenization
* token embeddings
* positional encodings
* attention mechanisms
* output layers
* training loops

We will first introduce sequential data modeling and tokenization, then try "text-generation" using the popular GPT-2 model and the Hugging Face pipeline. Then we will code the model elements of a simple LLM from scratch and train it ourselves.

## Environment Setup
1. If you are using ALCF, first log in. From a terminal run the following command:
```
ssh username@polaris.alcf.anl.gov
```

2. We will be downloading data in our Jupyter notebook, which runs on hardware that by default has no Internet access. From the terminal on Polaris, edit the ~/.bash_profile file to have these proxy settings:
```
export HTTP_PROXY="http://proxy-01.pub.alcf.anl.gov:3128"
export HTTPS_PROXY="http://proxy-01.pub.alcf.anl.gov:3128"
export http_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export https_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export ftp_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export no_proxy="admin,polaris-adminvm-01,localhost,*.cm.polaris.alcf.anl.gov,polaris-*,*.polaris.alcf.anl.gov,*.alcf.anl.gov"
```

3. Clone this repository (or pull the latest changes) so that you have the updated notebooks:
```
git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git
```

4. Now that we have the updated notebooks, we can open them. If you are using ALCF JupyterHub or Google Colab, you can be reminded of the steps [here](https://github.com/argonne-lcf/ai-science-training-series/blob/main/01_intro_AI_on_Supercomputer/01_linear_regression_sgd.ipynb).

5. Reminder: Change the notebook's kernel to `datascience/conda-2024-04-29` (you may need to change the kernel each time you open a notebook for the first time):

   1. select *Kernel* in the menu bar
   2. select *Change kernel...*
   3. select *datascience/conda-2024-04-29* from the drop-down menu

## __Exciting example:__

Here is an image of GenSLM, described earlier by Arvind Ramanathan. This is a language model that can represent genomic information in a single model. It was shown to model the evolution of SARS-CoV-2 without expensive experiments.
![GenSLM](images/genslm.png)
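As a quick taste of the text-generation exercise described above, here is a minimal Hugging Face pipeline sketch (a simplified stand-in for the notebook's code; the prompt and sampling settings are arbitrary, and downloading GPT-2 requires the proxy settings from the setup steps):

```python
from transformers import pipeline

# download GPT-2 and build a text-generation pipeline
generator = pipeline("text-generation", model="gpt2")

# sample a short continuation of a prompt
out = generator("Deep learning on supercomputers", max_new_tokens=30, num_return_sequences=1)
print(out[0]["generated_text"])
```

And since the attention mechanism is the heart of what the notebook builds from scratch, here is the standard scaled dot-product attention in a few lines of PyTorch (a generic textbook formulation, not the notebook's exact implementation):

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    # attention weights: softmax of query-key similarity, scaled by sqrt(dim)
    d = q.size(-1)
    scores = q @ k.transpose(-2, -1) / d**0.5
    return F.softmax(scores, dim=-1) @ v

q = k = v = torch.randn(1, 5, 16)   # (batch, tokens, embedding dim)
out = scaled_dot_product_attention(q, k, v)
print(out.shape)                     # torch.Size([1, 5, 16])
```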
53 | 54 | * ["The Illustrated Transformer"](https://jalammar.github.io/illustrated-transformer/) by Jay Alammar 55 | * ["Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention)"](https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/) 56 | * ["The Illustrated GPT-2 (Visualizing Transformer Language Models)"](https://jalammar.github.io/illustrated-gpt2/) 57 | * ["LLM Tutorial Workshop (Argonne National Laboratory)"](https://github.com/brettin/llm_tutorial/tree/main) 58 | * ["LLM Tutorial Workshop Part 2 (Argonne National Laboratory)"](https://github.com/argonne-lcf/llm-workshop) 59 | 60 | 61 | -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/images/Attention_Vis.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Attention_Vis.png -------------------------------------------------------------------------------- /03_introlangmodels/images/BERT_Explanation.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/BERT_Explanation.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/BERT_input_sent.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/BERT_input_sent.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/Byte_Pair_enc.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Byte_Pair_enc.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/Graphformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Graphformer.png -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Architectures.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Architectures.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Blackbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Blackbox.png -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Theoret_Apps.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Theoret_Apps.pptx -------------------------------------------------------------------------------- /03_introlangmodels/images/Lorem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Lorem.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Postitional_Embedding.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Postitional_Embedding.webp -------------------------------------------------------------------------------- 
/03_introlangmodels/images/Protein-Structure-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Protein-Structure-06.png -------------------------------------------------------------------------------- /03_introlangmodels/images/RNA-codons.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/RNA-codons.svg.png -------------------------------------------------------------------------------- /03_introlangmodels/images/The_transformer_encoder_decoder_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/The_transformer_encoder_decoder_stack.png -------------------------------------------------------------------------------- /03_introlangmodels/images/The_transformer_encoders_decoders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/The_transformer_encoders_decoders.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_Arch.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_Enc_Dec_Blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_Enc_Dec_Blocks.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_decoder.png -------------------------------------------------------------------------------- /03_introlangmodels/images/WordPieceTok.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/WordPieceTok.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/attention_is_all_you_need.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/attention_is_all_you_need.png -------------------------------------------------------------------------------- /03_introlangmodels/images/chars-tokenization.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/chars-tokenization.png -------------------------------------------------------------------------------- /03_introlangmodels/images/decoder_only_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/decoder_only_block.png -------------------------------------------------------------------------------- /03_introlangmodels/images/encode_decode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/encode_decode.png -------------------------------------------------------------------------------- /03_introlangmodels/images/encoder_with_tensors_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/encoder_with_tensors_2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/genslm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/genslm.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt-2-layers-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt-2-layers-2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt2-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt2-output.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt2-self-attention-example-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt2-self-attention-example-2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/one-hot-vocabulary-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/one-hot-vocabulary-example.png -------------------------------------------------------------------------------- /03_introlangmodels/images/output_target_probability_distributions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/output_target_probability_distributions.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/positional_encoding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/positional_encoding.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/recurrent_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/recurrent_nn.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/rnn.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-and-masked-self-attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-and-masked-self-attention.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-example-folders-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-example-folders-3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-output.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/text-processing---machines-vs-humans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/text-processing---machines-vs-humans.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/the_transformer_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/the_transformer_3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/tokenize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/tokenize.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/tokenize_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/tokenize_words.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer-decoder-intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer-decoder-intro.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_attention_heads_z.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_attention_heads_z.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoder_output_softmax (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoder_output_softmax (1).png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoder_output_softmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoder_output_softmax.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoding_1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoding_1.gif
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoding_2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoding_2.gif
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_logits_output_and_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_logits_output_and_label.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_multi-headed_self-attention-recap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_multi-headed_self-attention-recap.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_positional_encoding_vectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_positional_encoding_vectors.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_self-attention_visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_self-attention_visualization.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_self-attention_visualization_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_self-attention_visualization_3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/vision-transformer-vit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/vision-transformer-vit.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/viz-bert-voc-verbs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/viz-bert-voc-verbs.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/wordembedding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/wordembedding.png
--------------------------------------------------------------------------------
/03_profileLearning/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | *.lprof
3 | *.prof
4 | *.h5
5 | *.pb
6 | *.gz
7 | */logdir/*
8 | 
--------------------------------------------------------------------------------
/03_profileLearning/README.md:
--------------------------------------------------------------------------------
1 | # Profiling TensorFlow
2 | 
3 | _Authors_: Kyle Felker (felker@anl.gov), Corey Adams (corey.adams@anl.gov)
4 | 
5 | 
6 | In this example, we'll profile the CNN used to classify MNIST digits in the previous
7 | exercises. We will complete several rounds of profiling, each time enabling a new tool or
8 | optimization. At the end of the exercise, you'll have a much faster network.
9 | 
10 | Find the original script in `train_MNIST.py`.
11 | 
12 | All the scripts used here work in a Singularity container containing TensorFlow 2.5.0.
13 | 
14 | ```bash
15 | singularity exec --nv -B /lus /lus/theta-fs0/software/thetagpu/nvidia-containers/tensorflow2/tf2_21.07-py3.simg bash
16 | ```
17 | 
18 | This walkthrough was adapted from an earlier tutorial from the May 2021 ALCF Computational Performance Workshop, which used a Generative Adversarial Network (GAN) for the test code: [CPW21: Profiling TensorFlow](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/TensorFlow).
19 | A separate tutorial for profiling PyTorch codes (using a ResNet model) is also available from that workshop: [CPW21: Profiling PyTorch](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/PyTorchProfiler)
20 | 
21 | 
22 | ## A Starting Point
23 | 
24 | To download the MNIST dataset, make sure outbound HTTP(S) requests are routed through the ALCF proxy:
25 | ```bash
26 | export http_proxy=http://theta-proxy.tmi.alcf.anl.gov:3128
27 | export https_proxy=https://theta-proxy.tmi.alcf.anl.gov:3128
28 | ```
29 | 
30 | Run the original script, single node, like so: `python train_MNIST.py`. Feel free to ctrl+C once it hits a stable throughput.
31 | 
32 | Take note of the throughput reported!
33 | 
34 | ```
35 | 2021-08-02 21:49:36,778 - INFO - (0, 292), Loss: 0.109, step_time: 0.271, throughput: 235.822 img/s.
36 | 2021-08-02 21:49:37,050 - INFO - (0, 293), Loss: 0.129, step_time: 0.271, throughput: 235.804 img/s.
37 | 2021-08-02 21:49:37,321 - INFO - (0, 294), Loss: 0.022, step_time: 0.271, throughput: 236.466 img/s.
38 | 2021-08-02 21:49:37,593 - INFO - (0, 295), Loss: 0.073, step_time: 0.272, throughput: 235.060 img/s.
39 | 2021-08-02 21:49:37,865 - INFO - (0, 296), Loss: 0.026, step_time: 0.271, throughput: 235.941 img/s.
40 | 2021-08-02 21:49:38,136 - INFO - (0, 297), Loss: 0.042, step_time: 0.271, throughput: 236.474 img/s.
41 | 2021-08-02 21:49:38,407 - INFO - (0, 298), Loss: 0.054, step_time: 0.271, throughput: 236.156 img/s.
42 | 2021-08-02 21:49:38,679 - INFO - (0, 299), Loss: 0.132, step_time: 0.272, throughput: 235.603 img/s.
43 | 2021-08-02 21:49:38,951 - INFO - (0, 300), Loss: 0.091, step_time: 0.271, throughput: 235.760 img/s.
44 | 2021-08-02 21:49:39,222 - INFO - (0, 301), Loss: 0.024, step_time: 0.271, throughput: 236.121 img/s.
45 | 2021-08-02 21:49:39,494 - INFO - (0, 302), Loss: 0.229, step_time: 0.271, throughput: 235.878 img/s.
46 | ```
47 | 
48 | On average, the A100 system is moving about 237 images/second through this training loop. Let's dig into the first optimization in the [`line_profiler/`](./line_profiler/) subdirectory.
49 | 
50 | Below are the wrap-up conclusions, which you can read ahead or come back to later.
51 | 
52 | # Conclusions
53 | 
54 | Try the `optimized` version of the code - what throughput are you getting? It should be a good deal faster! (1.05 million img/s - roughly 4400x faster than the ~237 img/s baseline.) So, after all the profiling, what optimizations did we learn?
55 | 
56 | - Make sure that IO isn't a bottleneck. In this case, the fix for this bottleneck was simple. With big datasets it can be a significant challenge to keep the GPU fed and not idle on IO.
57 | - Make sure to use graph compilation where you can. It's easy to make mistakes here: you must make sure to use only TensorFlow operations!
58 | - Use XLA, if your training loops have more work in them than a simple MNIST CNN. It can give excellent speed ups by fusing operations.
59 | - Use reduced or mixed precision, again if your training loops have more local work than the example here. Reduced precision becomes particularly powerful when XLA is involved, allowing you to keep the Tensor Cores chugging along with fewer memory-bound operations.
60 | 
61 | In general, if you have an application running in TensorFlow, it's a great idea to profile periodically and make sure you've got all the basic optimizations down!
62 | 
63 | # Comparison to GAN example
64 | 
65 | As mentioned above, a very similar walkthrough based on a Generative Adversarial Network (GAN) is available here: [CPW21: Profiling TensorFlow](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/TensorFlow). You are encouraged to compare the results from that tutorial to the lessons learned here. Despite very similar source code, the performance behavior differs from this CNN in some key aspects:
66 | - The optimized GAN only reaches 137k images/second
67 | - XLA is very important to achieving the optimal performance at this throughput range.
68 | - Mixed precision nets about +40k images/second over XLA alone
69 | - The CNN actually has more trainable parameters and is in some sense a "larger" model compared to the GAN, yet is much faster to train. All the extra "GAN-like" operations in the `forward_pass()` function (generate fake images, extra forward pass through the discriminator, label softening, randomly flipping the labels, etc.) cause the 5-10x slowdown relative to a standard feedforward CNN.
70 | 
--------------------------------------------------------------------------------
/03_profileLearning/line_profiler/train_MNIST.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | def init_mpi():
14 |     # Try to initialize Horovod to determine if we're running under MPI:
15 |     try:
16 |         hvd.init()
17 |         return hvd.rank(), hvd.size()
18 |     except:
19 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
20 |             raise Exception("MPI detected in command line but was not able to init!")
21 |         return 0, 1
22 | 
23 | 
24 | def configure_logger(rank):
25 |     '''Configure a global logger
26 | 
27 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
28 | 
29 |     Pass in the MPI rank.
30 | 
31 |     '''
32 |     logger = logging.getLogger()
33 | 
34 |     # Create a handler for STDOUT, but only on the root rank.
35 |     # If not distributed, we still get 0 passed in here.
36 |     if rank == 0:
37 |         stream_handler = logging.StreamHandler()
38 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
39 |         stream_handler.setFormatter(formatter)
40 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
41 |         logger.addHandler(handler)
42 | 
43 |         # Add a file handler too:
44 |         log_file = "process.log"
45 |         file_handler = logging.FileHandler(log_file)
46 |         file_handler.setFormatter(formatter)
47 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
48 |         logger.addHandler(file_handler)
49 | 
50 |         logger.setLevel(logging.INFO)
51 |     else:
52 |         # in this case, MPI is available but it's not rank 0
53 |         # create a null handler
54 |         handler = logging.NullHandler()
55 |         logger.addHandler(handler)
56 |         logger.setLevel(logging.INFO)
57 | 
58 | 
59 | class MNISTClassifier(tf.keras.models.Model):
60 | 
61 |     def __init__(self, activation=tf.nn.tanh):
62 |         tf.keras.models.Model.__init__(self)
63 | 
64 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
65 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
66 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
67 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
68 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
69 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
70 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
71 | 
72 |     def call(self, inputs):
73 |         '''
74 |         Reshape at input and output:
75 |         '''
76 |         # batch_size = inputs.shape[0]
77 | 
78 |         x = self.conv_1(inputs)
79 |         x = self.conv_2(x)
80 |         x = self.pool_3(x)
81 |         x = self.drop_4(x)
82 |         x = tf.keras.layers.Flatten()(x)
83 |         x = self.dense_5(x)
84 |         x = self.drop_6(x)
85 |         x = self.dense_7(x)
86 | 
87 |         return x
88 | 
89 | 
90 | def compute_loss(y_true, y_pred):
91 |     # if labels are integers, use sparse categorical crossentropy
92 |     # network's final layer is softmax, so from_logits=False
93 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
94 |     # if labels are one-hot encoded, use standard crossentropy
95 | 
96 |     return scce(y_true, y_pred)  # .numpy()
97 | 
98 | 
99 | def get_dataset():
100 | 
101 |     # Read in the mnist data so we have it loaded globally:
102 |     (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
103 |     x_train = x_train.astype(numpy.float32)
104 |     x_test = x_test.astype(numpy.float32)
105 | 
106 |     x_train /= 255.
107 |     x_test /= 255.
108 | 
109 |     y_train = y_train.astype(numpy.int32)
110 |     y_test = y_test.astype(numpy.int32)
111 | 
112 |     return x_train, x_test, y_train, y_test
113 | 
114 | 
115 | def fetch_batch(_batch_size):
116 |     x_train, x_test, y_train, y_test = get_dataset()
117 | 
118 |     indexes = numpy.random.choice(a=x_train.shape[0], size=[_batch_size,])
119 | 
120 |     images = x_train[indexes].reshape(_batch_size, 28, 28, 1)
121 |     labels = y_train[indexes].reshape(_batch_size, 1)
122 | 
123 |     return images, labels
124 | 
125 | 
126 | @profile
127 | def forward_pass(model, batch_size):
128 |     batch_data, y_true = fetch_batch(batch_size)
129 |     y_pred = model(batch_data)
130 |     loss = compute_loss(y_true, y_pred)
131 |     return loss
132 | 
133 | 
134 | @profile
135 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
136 | 
137 |     logger = logging.getLogger()
138 | 
139 |     rank = hvd.rank()
140 |     for i_epoch in range(n_training_epochs):
141 | 
142 |         epoch_steps = int(60000/batch_size)
143 | 
144 |         for i_batch in range(epoch_steps):
145 | 
146 |             start = time.time()
147 | 
148 |             with tf.GradientTape() as tape:
149 |                 loss = forward_pass(model, batch_size)
150 | 
151 |             if global_size != 1:
152 |                 tape = hvd.DistributedGradientTape(tape)
153 | 
154 |             trainable_vars = model.trainable_variables
155 | 
156 |             # Apply the update to the network (one at a time):
157 |             grads = tape.gradient(loss, trainable_vars)
158 | 
159 |             opt.apply_gradients(zip(grads, trainable_vars))
160 | 
161 |             end = time.time()
162 | 
163 |             images = batch_size*global_size
164 | 
165 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
166 | 
167 | 
168 | @profile
169 | def train_network(_batch_size, _training_iterations, _lr, global_size):
170 | 
171 |     mnist_model = MNISTClassifier()
172 | 
173 |     opt = tf.keras.optimizers.Adam(_lr)
174 | 
175 |     if global_size != 1:
176 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
177 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
178 | 
179 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
180 | 
181 | 
182 | if __name__ == '__main__':
183 | 
184 |     rank, size = init_mpi()
185 |     configure_logger(rank)
186 | 
187 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
188 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
189 |                         help='input batch size for training (default: 64)')
190 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
191 |                         help='number of epochs to train (default: 10)')
192 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
193 |                         help='learning rate (default: 0.01)')
194 |     # parser.add_argument('--device', default='cpu',
195 |     #                     help='Whether this is running on cpu or gpu')
196 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
197 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
198 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
199 |     #                     type=int)
200 | 
201 |     args = parser.parse_args()
202 |     scaled_lr = args.lr * hvd.size()
203 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
204 | 
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/README.md:
--------------------------------------------------------------------------------
1 | # Reduced and Mixed Precision
2 | 
3 | *Note*: for a more extensive discussion and tutorial related to mixed precision training of neural networks, refer to [CPW21: Reduced and Mixed Precision](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/10_reduced-precision).
4 | 
5 | Switching to reduced or mixed precision is not that hard in TensorFlow's Keras API:
6 | ```python
7 | tf.keras.mixed_precision.set_global_policy("mixed_float16")
8 | ```
9 | 
10 | In this case, there are also a bunch of places where I hard-coded `float32` - oops! Fix those too. The final layer (softmax) should **not** use mixed precision.
11 | 
12 | In general, loss scaling should be used to prevent numerical underflow (and sometimes overflow) when using `float16`. See [the TensorFlow guide to Mixed Precision](https://www.tensorflow.org/guide/mixed_precision#training_the_model_with_a_custom_training_loop) for more details; a minimal custom-loop sketch is also included at the end of this README.
13 | 
14 | Now run the code with mixed precision and the default Keras loss scale optimizer (both XLA and profiling are disabled here):
15 | ```
16 | python train_MNIST_tf_function_XLA_mixed.py --epochs 1 --batch_size 1024
17 | 
18 | ...
19 | 2021-08-10 02:35:00,466 - INFO - (0, 46), Loss: 0.11277, step_time: 0.00532, throughput: 1.93e+05 img/s.
20 | 2021-08-10 02:35:00,473 - INFO - (0, 47), Loss: 0.08135, step_time: 0.00533, throughput: 1.92e+05 img/s.
21 | 2021-08-10 02:35:00,485 - INFO - (0, 48), Loss: 0.10929, step_time: 0.01039, throughput: 9.85e+04 img/s.
22 | 2021-08-10 02:35:00,493 - INFO - (0, 49), Loss: 0.10958, step_time: 0.00538, throughput: 1.90e+05 img/s.
23 | 2021-08-10 02:35:00,500 - INFO - (0, 50), Loss: 0.09276, step_time: 0.00539, throughput: 1.90e+05 img/s.
24 | 2021-08-10 02:35:00,508 - INFO - (0, 51), Loss: 0.09720, step_time: 0.00543, throughput: 1.88e+05 img/s.
25 | 2021-08-10 02:35:00,515 - INFO - (0, 52), Loss: 0.07887, step_time: 0.00538, throughput: 1.90e+05 img/s.
26 | 2021-08-10 02:35:00,523 - INFO - (0, 53), Loss: 0.06713, step_time: 0.00526, throughput: 1.95e+05 img/s.
27 | ```
28 | The performance has gotten significantly worse (5x slower than before). Quick investigation reveals that the _dynamic_ loss scaling optimizer is responsible for the degradation. Typically, such default, automated loss scaling is relatively inexpensive and should be used. However, with such a simple network architecture and training loop, the local work is so small and the image throughput is so high that even this small extra cost can harm your peak efficiency.
29 | 
30 | Comment out the default `LossScaleOptimizer` and use the one with `dynamic=False`:
31 | ```
32 | # Dynamic loss scaling (more expensive, but more reliable)
33 | #opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(_lr))
34 | # Fixed loss scaling (cheap)
35 | opt = tf.keras.mixed_precision.LossScaleOptimizer(
36 |     tf.keras.optimizers.Adam(_lr),
37 |     dynamic=False,
38 |     initial_scale=1024,
39 | )
40 | ```
41 | If you use too large of an `initial_scale`, the loss function will return NaNs. Using a fixed scale of 1024 seems fairly robust for our case:
42 | ```
43 | 2021-08-10 02:30:46,418 - INFO - (0, 48), Loss: 0.08706, step_time: 0.00100, throughput: 1.02e+06 img/s.
44 | 2021-08-10 02:30:46,425 - INFO - (0, 49), Loss: 0.10371, step_time: 0.00101, throughput: 1.02e+06 img/s.
45 | 2021-08-10 02:30:46,432 - INFO - (0, 50), Loss: 0.09754, step_time: 0.00100, throughput: 1.03e+06 img/s.
46 | 2021-08-10 02:30:46,438 - INFO - (0, 51), Loss: 0.09409, step_time: 0.00099, throughput: 1.03e+06 img/s.
47 | 2021-08-10 02:30:46,445 - INFO - (0, 52), Loss: 0.06493, step_time: 0.00101, throughput: 1.01e+06 img/s.
48 | 2021-08-10 02:30:46,451 - INFO - (0, 53), Loss: 0.05253, step_time: 0.00100, throughput: 1.03e+06 img/s.
49 | 2021-08-10 02:30:46,458 - INFO - (0, 54), Loss: 0.07965, step_time: 0.00100, throughput: 1.02e+06 img/s.
50 | 2021-08-10 02:30:46,465 - INFO - (0, 55), Loss: 0.06847, step_time: 0.00100, throughput: 1.02e+06 img/s.
51 | ```
52 | 
53 | This is disappointing! Just like with XLA, we ran with mixed precision and it is slightly SLOWER. Let's look into the profile to discover why.
54 | 
55 | Here's the overview page. We note right away that, in the bottom left, it IS using a good amount of reduced precision.
56 | 
57 | ![Tensorboard Profiler Overview](./images/profiler_overview.png)
58 | 
59 | Scrolling down:
60 | 
61 | ![top 10](./images/top10-ops.png)
62 | 
63 | Compared to the float32 top-10 operations, this is pretty different. There is one op that is particularly dominant! It is still a conv2d operation, but for some reason it is much slower than the others.
64 | 
65 | Here is the Kernel Statistics page again:
66 | 
67 | ![kernel stats](./images/kernel-stats.png)
68 | 
69 | We see the same problem there - except this time it's pointing to the wgrad (weight gradient) calculation of the second Conv2D layer. The TensorFlow Statistics shows similar info:
70 | 
71 | ![tf stats](./images/tf-stats.png)
72 | 
73 | And there is also a timeline view of all ops (trace viewer):
74 | 
75 | ![timeline](./images/trace-viewer.png)
76 | 
77 | And zoomed:
78 | 
79 | ![timeline zoom](./images/trace-zoomed.png)
80 | 
81 | Compare this to the trace viewer for the `float32` case:
82 | 
83 | ![timeline zoom fp32](./images/trace-zoomed-float32.png)
84 | 
85 | So, reduced precision appears to be slower because of a marginally larger kernel launch
86 | time for the `float16` kernels vs. the `float32` versions of the kernels (which are
87 | **not** using the TensorCores of the A100). This especially affects the backpropagation
88 | weight gradient calculation of the second Conv2D layer, as seen above.
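
As a closing note on the loss-scaling discussion earlier in this README: with a custom training loop, the `LossScaleOptimizer` does not apply the scale for you automatically. Below is a minimal sketch of the fixed-scale pattern, following the TensorFlow mixed-precision guide linked above. It is illustrative only - `MNISTClassifier` and `compute_loss` refer to the tutorial code, and the learning rate and scale are simply the values discussed above:

```python
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Fixed (cheap) loss scaling, as recommended above for this tiny network:
opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(0.01), dynamic=False, initial_scale=1024)

model = MNISTClassifier()

@tf.function
def train_step(batch_data, y_true):
    with tf.GradientTape() as tape:
        loss = compute_loss(y_true, model(batch_data))
        # Scale the loss up so small float16 gradients don't flush to zero...
        scaled_loss = opt.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
    # ...then undo the scaling on the gradients before applying the update.
    grads = opt.get_unscaled_gradients(scaled_grads)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss
```

Because `dynamic=False` means the scale is never re-tuned at runtime, this avoids the overhead measured above - but a poorly chosen `initial_scale` will silently produce NaNs, so keep an eye on the loss.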
89 | 
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/kernel-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/kernel-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/profiler_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/profiler_overview.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/tf-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/tf-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/top10-ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/top10-ops.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-viewer.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-zoomed-float32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-zoomed-float32.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-zoomed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-zoomed.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/README.md:
--------------------------------------------------------------------------------
1 | # TF Function and Graph Compilation
2 | 
3 | Using line profiler showed us that the largest share of the computation, by far, was the train loop and its subcalls. Here, we'll wrap those functions in `@tf.function` decorators to improve performance with graph compilation. Additionally, some of the operations have to be re-written to stay entirely within the TensorFlow library and not use NumPy calls (a minimal sketch of that pattern appears just below).
4 | 
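
To make the "stay inside TensorFlow" point concrete, here is a minimal sketch of the kind of rewrite involved. It is illustrative only - the `sample_batch` helper below is hypothetical, not part of the tutorial scripts - but it shows the NumPy-style batch selection from the earlier `fetch_batch()` re-expressed with TensorFlow ops so the whole step can be traced into a graph:

```python
import tensorflow as tf

# Before (NumPy runs eagerly, outside the graph, so it defeats tracing):
#   indexes = numpy.random.choice(a=x_train.shape[0], size=[batch_size,])
#   images  = x_train[indexes].reshape(batch_size, 28, 28, 1)

@tf.function  # traced once, then replayed as a compiled graph
def sample_batch(x_train, y_train, batch_size):
    # Pure-TF equivalent: random integer indices plus a gather stay in the graph.
    n = tf.shape(x_train)[0]
    indexes = tf.random.uniform([batch_size], maxval=n, dtype=tf.int32)
    images = tf.reshape(tf.gather(x_train, indexes), [batch_size, 28, 28, 1])
    labels = tf.gather(y_train, indexes)
    return images, labels
```

(The scripts in this folder actually go one step further and draw batches from a `tf.data.Dataset` instead of sampling randomly.)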
5 | This has a mild overhead at the start for tracing but then runs more quickly. Observe how the first few iterations are slower:
6 | ```
7 | python train_MNIST_tf_function.py --epochs 1
8 | ```
9 | Out of the box, we see a dramatic speed up:
10 | ```
11 | 2021-08-10 01:46:46,371 - INFO - (0, 923), Loss: 0.001, step_time: 0.001, throughput: 74358.852 img/s.
12 | 2021-08-10 01:46:46,372 - INFO - (0, 924), Loss: 0.000, step_time: 0.001, throughput: 74030.738 img/s.
13 | 2021-08-10 01:46:46,374 - INFO - (0, 925), Loss: 0.000, step_time: 0.001, throughput: 66296.729 img/s.
14 | 2021-08-10 01:46:46,375 - INFO - (0, 926), Loss: 0.123, step_time: 0.001, throughput: 74379.456 img/s.
15 | 2021-08-10 01:46:46,377 - INFO - (0, 927), Loss: 0.035, step_time: 0.001, throughput: 67480.004 img/s.
16 | 2021-08-10 01:46:46,378 - INFO - (0, 928), Loss: 0.040, step_time: 0.001, throughput: 73847.443 img/s.
17 | 2021-08-10 01:46:46,380 - INFO - (0, 929), Loss: 0.017, step_time: 0.001, throughput: 73564.115 img/s.
18 | 2021-08-10 01:46:46,381 - INFO - (0, 930), Loss: 0.000, step_time: 0.001, throughput: 71335.492 img/s.
19 | 2021-08-10 01:46:46,383 - INFO - (0, 931), Loss: 0.007, step_time: 0.001, throughput: 71985.909 img/s.
20 | 2021-08-10 01:46:46,384 - INFO - (0, 932), Loss: 0.268, step_time: 0.001, throughput: 72354.570 img/s.
21 | 2021-08-10 01:46:46,385 - INFO - (0, 933), Loss: 0.443, step_time: 0.001, throughput: 73282.953 img/s.
22 | 2021-08-10 01:46:46,387 - INFO - (0, 934), Loss: 0.047, step_time: 0.001, throughput: 74214.945 img/s.
23 | 2021-08-10 01:46:46,388 - INFO - (0, 935), Loss: 0.011, step_time: 0.001, throughput: 74153.441 img/s.
24 | ```
25 | 
26 | Instead of ~237 img/s, we're at 74,000 img/s. But note that the step time is now only 0.001 seconds. We're likely not doing enough work per batch. Increase the batch size from 64 to 1024 or 2048:
27 | ```
28 | python train_MNIST_tf_function.py --epochs 1 --batch_size 1024
29 | ```
30 | Which yields:
31 | ```
32 | 2021-08-10 01:44:13,671 - INFO - (0, 29), Loss: 0.158, step_time: 0.001, throughput: 1099582.001 img/s.
33 | 2021-08-10 01:44:13,679 - INFO - (0, 30), Loss: 0.202, step_time: 0.001, throughput: 1085958.861 img/s.
34 | 2021-08-10 01:44:13,686 - INFO - (0, 31), Loss: 0.172, step_time: 0.001, throughput: 1093980.463 img/s.
35 | 2021-08-10 01:44:13,694 - INFO - (0, 32), Loss: 0.140, step_time: 0.001, throughput: 1093423.446 img/s.
36 | 2021-08-10 01:44:13,701 - INFO - (0, 33), Loss: 0.117, step_time: 0.001, throughput: 1087608.837 img/s.
37 | 2021-08-10 01:44:13,709 - INFO - (0, 34), Loss: 0.120, step_time: 0.001, throughput: 1074278.963 img/s.
38 | 2021-08-10 01:44:13,716 - INFO - (0, 35), Loss: 0.140, step_time: 0.001, throughput: 1052172.292 img/s.
39 | 2021-08-10 01:44:13,724 - INFO - (0, 36), Loss: 0.184, step_time: 0.001, throughput: 1062584.685 img/s.
40 | 2021-08-10 01:44:13,731 - INFO - (0, 37), Loss: 0.108, step_time: 0.001, throughput: 1053721.123 img/s.
41 | 2021-08-10 01:44:13,739 - INFO - (0, 38), Loss: 0.112, step_time: 0.001, throughput: 1064691.942 img/s
42 | ```
43 | 
44 | We can also enable [XLA Fusion](https://www.tensorflow.org/xla) (for GPU or CPU) to speed up the computations by fusing small ops together.
45 | 
46 | XLA can be enabled (for code within `tf.function` sections) without any changes to your code by setting an environment variable on the command line:
47 | `TF_XLA_FLAGS=--tf_xla_auto_jit=2 python train_MNIST_tf_function.py --epochs 1 --batch_size 1024`
48 | This examines your `tf.function()` decorated functions and automatically decides if/how to cluster them in XLA operations.
49 | > A simple way to start using XLA in TensorFlow models without any changes is to enable auto-clustering, which automatically finds clusters (connected subgraphs) within the TensorFlow functions which can be compiled and executed using XLA.
50 | 
51 | Or, XLA can be manually enabled by explicitly setting `@tf.function(jit_compile=True)` for the desired functions. This option was formerly called `experimental_compile` (before TensorFlow v2.5.0). Try:
52 | ```
53 | python train_MNIST_tf_function_XLA.py --epochs 1 --batch_size 1024
54 | ```
55 | 
56 | In this case, XLA actually **harms performance** as our training throughput drops to 650k images/second:
57 | ```
58 | 2021-08-10 01:55:04,462 - INFO - (0, 10), Loss: 0.381, step_time: 0.002, throughput: 643248.060 img/s.
59 | 2021-08-10 01:55:04,470 - INFO - (0, 11), Loss: 0.500, step_time: 0.002, throughput: 646929.853 img/s.
60 | 2021-08-10 01:55:04,478 - INFO - (0, 12), Loss: 0.506, step_time: 0.002, throughput: 652929.051 img/s.
61 | 2021-08-10 01:55:04,486 - INFO - (0, 13), Loss: 0.371, step_time: 0.002, throughput: 646345.718 img/s.
62 | 2021-08-10 01:55:04,494 - INFO - (0, 14), Loss: 0.405, step_time: 0.002, throughput: 650752.621 img/s.
63 | 2021-08-10 01:55:04,502 - INFO - (0, 15), Loss: 0.296, step_time: 0.002, throughput: 654022.734 img/s.
64 | 2021-08-10 01:55:04,510 - INFO - (0, 16), Loss: 0.255, step_time: 0.002, throughput: 651147.255 img/s.
65 | 2021-08-10 01:55:04,518 - INFO - (0, 17), Loss: 0.250, step_time: 0.002, throughput: 648982.668 img/s.
66 | 2021-08-10 01:55:04,526 - INFO - (0, 18), Loss: 0.195, step_time: 0.002, throughput: 657124.739 img/s.
67 | 2021-08-10 01:55:04,534 - INFO - (0, 19), Loss: 0.213, step_time: 0.002, throughput: 653525.152 img/s.
68 | ```
69 | 
70 | As an aside, using a slightly older version of TensorFlow than the July 2021 NGC image release leads to a seg-fault when XLA is used on any of the `tf.function` sections other than `compute_loss()`. The generated XLA programs can be dumped with:
71 | ```
72 | TF_DUMP_GRAPH_PREFIX=/tmp/generated TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2" XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=/tmp/generated" python train_MNIST_tf_function.py --epochs 1 --batch_size 1024
73 | ```
74 | and are kept for reference in [`xla_bug_generated/`](./xla_bug_generated/). Feel free to try it yourself (and maybe open a bug report in the TensorFlow repository!):
75 | `/lus/theta-fs0/software/thetagpu/nvidia-containers/tensorflow2/tf2_21.04-py3.simg`
76 | 
77 | Beyond this, we'll have to run the TensorFlow Profiler. That is in the next folder, [`tf_profiler/`](../tf_profiler/).
78 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/train_MNIST_tf_function.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | # Read in the mnist data so we have it loaded globally:
14 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
15 | x_train = x_train.astype(numpy.float32)
16 | x_test = x_test.astype(numpy.float32)
17 | 
18 | x_train /= 255.
19 | x_test /= 255.
20 | 
21 | y_train = y_train.astype(numpy.int32)
22 | y_test = y_test.astype(numpy.int32)
23 | 
24 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
25 | dataset = dataset.shuffle(60000)  # shuffle() returns a new dataset; the result must be kept
26 | 
27 | 
28 | def init_mpi():
29 |     # Try to initialize Horovod to determine if we're running under MPI:
30 |     try:
31 |         hvd.init()
32 |         return hvd.rank(), hvd.size()
33 |     except:
34 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
35 |             raise Exception("MPI detected in command line but was not able to init!")
36 |         return 0, 1
37 | 
38 | 
39 | def configure_logger(rank):
40 |     '''Configure a global logger
41 | 
42 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
43 | 
44 |     Pass in the MPI rank.
45 | 
46 |     '''
47 |     logger = logging.getLogger()
48 | 
49 |     # Create a handler for STDOUT, but only on the root rank.
50 |     # If not distributed, we still get 0 passed in here.
51 |     if rank == 0:
52 |         stream_handler = logging.StreamHandler()
53 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
54 |         stream_handler.setFormatter(formatter)
55 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
56 |         logger.addHandler(handler)
57 | 
58 |         # Add a file handler too:
59 |         log_file = "process.log"
60 |         file_handler = logging.FileHandler(log_file)
61 |         file_handler.setFormatter(formatter)
62 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
63 |         logger.addHandler(file_handler)
64 | 
65 |         logger.setLevel(logging.INFO)
66 |     else:
67 |         # in this case, MPI is available but it's not rank 0
68 |         # create a null handler
69 |         handler = logging.NullHandler()
70 |         logger.addHandler(handler)
71 |         logger.setLevel(logging.INFO)
72 | 
73 | 
74 | class MNISTClassifier(tf.keras.models.Model):
75 | 
76 |     def __init__(self, activation=tf.nn.tanh):
77 |         tf.keras.models.Model.__init__(self)
78 | 
79 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
80 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
81 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
82 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
83 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
84 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
85 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
86 | 
87 |     #@tf.function
88 |     def call(self, inputs):
89 |         '''
90 |         Reshape at input and output:
91 |         '''
92 |         # batch_size = inputs.shape[0]
93 | 
94 |         x = self.conv_1(inputs)
95 |         x = self.conv_2(x)
96 |         x = self.pool_3(x)
97 |         x = self.drop_4(x)
98 |         x = tf.keras.layers.Flatten()(x)
99 |         x = self.dense_5(x)
100 |         x = self.drop_6(x)
101 |         x = self.dense_7(x)
102 | 
103 |         return x
104 | 
105 | 
106 | #@tf.function
107 | def compute_loss(y_true, y_pred):
108 |     # if labels are integers, use sparse categorical crossentropy
109 |     # network's final layer is softmax, so from_logits=False
110 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
111 |     # if labels are one-hot encoded, use standard crossentropy
112 | 
113 |     return scce(y_true, y_pred)  # .numpy()
114 | 
115 | 
116 | #@tf.function
117 | def forward_pass(model, batch_data, y_true):
118 |     y_pred = model(batch_data)
119 |     loss = compute_loss(y_true, y_pred)
120 |     return loss
121 | 
122 | 
123 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
124 | 
125 |     @tf.function
126 |     def train_iteration(data, y_true, model, opt, global_size):
127 |         with tf.GradientTape() as tape:
128 |             loss = forward_pass(model, data, y_true)
129 | 
130 |         if global_size != 1:
131 |             tape = hvd.DistributedGradientTape(tape)
132 | 
133 |         trainable_vars = model.trainable_variables
134 | 
135 |         # Apply the update to the network (one at a time):
136 |         grads = tape.gradient(loss, trainable_vars)
137 | 
138 |         opt.apply_gradients(zip(grads, trainable_vars))
139 |         return loss
140 | 
141 | 
142 |     logger = logging.getLogger()
143 | 
144 |     rank = hvd.rank()
145 |     for i_epoch in range(n_training_epochs):
146 | 
147 |         epoch_steps = int(60000/batch_size)
148 |         shuffled = dataset.shuffle(60000)  # Shuffle the whole dataset in memory; shuffle() returns a new dataset
149 |         batches = shuffled.batch(batch_size=batch_size, drop_remainder=True)
150 | 
151 |         for i_batch, (batch_data, y_true) in enumerate(batches):
152 | 
153 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
154 | 
155 |             start = time.time()
156 | 
157 |             loss = train_iteration(batch_data, y_true, model, opt, global_size)
158 | 
159 |             end = time.time()
160 | 
161 |             images = batch_size*global_size
162 | 
163 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
164 | 
165 | 
166 | def train_network(_batch_size, _training_iterations, _lr, global_size):
167 | 
168 |     mnist_model = MNISTClassifier()
169 | 
170 |     opt = tf.keras.optimizers.Adam(_lr)
171 | 
172 |     if global_size != 1:
173 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
174 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
175 | 
176 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
177 | 
178 | 
179 | if __name__ == '__main__':
180 | 
181 |     rank, size = init_mpi()
182 |     configure_logger(rank)
183 | 
184 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
185 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
186 |                         help='input batch size for training (default: 64)')
187 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
188 |                         help='number of epochs to train (default: 10)')
189 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
190 |                         help='learning rate (default: 0.01)')
191 |     # parser.add_argument('--device', default='cpu',
192 |     #                     help='Whether this is running on cpu or gpu')
193 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
194 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
195 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
196 |     #                     type=int)
197 | 
198 |     args = parser.parse_args()
199 |     scaled_lr = args.lr * hvd.size()
200 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
201 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/train_MNIST_tf_function_XLA.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | # Read in the mnist data so we have it loaded globally:
14 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
15 | x_train = x_train.astype(numpy.float32)
16 | x_test = x_test.astype(numpy.float32)
17 | 
18 | x_train /= 255.
19 | x_test /= 255.
20 | 
21 | y_train = y_train.astype(numpy.int32)
22 | y_test = y_test.astype(numpy.int32)
23 | 
24 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
25 | dataset = dataset.shuffle(60000)  # shuffle() returns a new dataset; the result must be kept
26 | 
27 | 
28 | def init_mpi():
29 |     # Try to initialize Horovod to determine if we're running under MPI:
30 |     try:
31 |         hvd.init()
32 |         return hvd.rank(), hvd.size()
33 |     except:
34 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
35 |             raise Exception("MPI detected in command line but was not able to init!")
36 |         return 0, 1
37 | 
38 | 
39 | def configure_logger(rank):
40 |     '''Configure a global logger
41 | 
42 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
43 | 
44 |     Pass in the MPI rank.
45 | 
46 |     '''
47 |     logger = logging.getLogger()
48 | 
49 |     # Create a handler for STDOUT, but only on the root rank.
50 |     # If not distributed, we still get 0 passed in here.
51 |     if rank == 0:
52 |         stream_handler = logging.StreamHandler()
53 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
54 |         stream_handler.setFormatter(formatter)
55 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
56 |         logger.addHandler(handler)
57 | 
58 |         # Add a file handler too:
59 |         log_file = "process.log"
60 |         file_handler = logging.FileHandler(log_file)
61 |         file_handler.setFormatter(formatter)
62 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
63 |         logger.addHandler(file_handler)
64 | 
65 |         logger.setLevel(logging.INFO)
66 |     else:
67 |         # in this case, MPI is available but it's not rank 0
68 |         # create a null handler
69 |         handler = logging.NullHandler()
70 |         logger.addHandler(handler)
71 |         logger.setLevel(logging.INFO)
72 | 
73 | 
74 | class MNISTClassifier(tf.keras.models.Model):
75 | 
76 |     def __init__(self, activation=tf.nn.tanh):
77 |         tf.keras.models.Model.__init__(self)
78 | 
79 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
80 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
81 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
82 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
83 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
84 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
85 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
86 | 
87 |     #@tf.function(jit_compile=True)
88 |     def call(self, inputs):
89 |         '''
90 |         Reshape at input and output:
91 |         '''
92 |         # batch_size = inputs.shape[0]
93 | 
94 |         x = self.conv_1(inputs)
95 |         x = self.conv_2(x)
96 |         x = self.pool_3(x)
97 |         x = self.drop_4(x)
98 |         x = tf.keras.layers.Flatten()(x)
99 |         x = self.dense_5(x)
100 |         x = self.drop_6(x)
101 |         x = self.dense_7(x)
102 | 
103 |         return x
104 | 
105 | 
106 | #@tf.function(jit_compile=True)
107 | def compute_loss(y_true, y_pred):
108 |     # if labels are integers, use sparse categorical crossentropy
109 |     # network's final layer is softmax, so from_logits=False
110 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
111 |     # if labels are one-hot encoded, use standard crossentropy
112 | 
113 |     return scce(y_true, y_pred)  # .numpy()
114 | 
115 | 
116 | #@tf.function(jit_compile=True)
117 | def forward_pass(model, batch_data, y_true):
118 |     y_pred = model(batch_data)
119 |     loss = compute_loss(y_true, y_pred)
120 |     return loss
121 | 
122 | 
123 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
124 | 
125 |     @tf.function(jit_compile=True)
126 |     def train_iteration(data, y_true, model, opt, global_size):
127 |         with tf.GradientTape() as tape:
128 |             loss = forward_pass(model, data, y_true)
129 | 
130 |         if global_size != 1:
131 |             tape = hvd.DistributedGradientTape(tape)
132 | 
133 |         trainable_vars = model.trainable_variables
134 | 
135 |         # Apply the update to the network (one at a time):
136 |         grads = tape.gradient(loss, trainable_vars)
137 | 
138 |         opt.apply_gradients(zip(grads, trainable_vars))
139 |         return loss
140 | 
141 | 
142 |     logger = logging.getLogger()
143 | 
144 |     rank = hvd.rank()
145 |     for i_epoch in range(n_training_epochs):
146 | 
147 |         epoch_steps = int(60000/batch_size)
148 |         shuffled = dataset.shuffle(60000)  # Shuffle the whole dataset in memory; shuffle() returns a new dataset
149 |         batches = shuffled.batch(batch_size=batch_size, drop_remainder=True)
150 | 
151 |         for i_batch, (batch_data, y_true) in enumerate(batches):
152 | 
153 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
154 | 
155 |             start = time.time()
156 | 
157 |             loss = train_iteration(batch_data, y_true, model, opt, global_size)
158 | 
159 |             end = time.time()
160 | 
161 |             images = batch_size*global_size
162 | 
163 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
164 | 
165 | 
166 | def train_network(_batch_size, _training_iterations, _lr, global_size):
167 | 
168 |     mnist_model = MNISTClassifier()
169 | 
170 |     opt = tf.keras.optimizers.Adam(_lr)
171 | 
172 |     if global_size != 1:
173 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
174 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
175 | 
176 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
177 | 
178 | 
179 | if __name__ == '__main__':
180 | 
181 |     rank, size = init_mpi()
182 |     configure_logger(rank)
183 | 
184 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
185 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
186 |                         help='input batch size for training (default: 64)')
187 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
188 |                         help='number of epochs to train (default: 10)')
189 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
190 |                         help='learning rate (default: 0.01)')
191 |     # parser.add_argument('--device', default='cpu',
192 |     #                     help='Whether this is running on cpu or gpu')
193 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
194 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
195 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
196 |     #                     type=int)
197 | 
198 |     args = parser.parse_args()
199 |     scaled_lr = args.lr * hvd.size()
200 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
201 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/README.md:
--------------------------------------------------------------------------------
1 | # TF Profiling Tool
2 | 
3 | 
4 | Note the lines `tf.profiler.experimental.start('logdir')` and `tf.profiler.experimental.stop()` in the code. This sets up and tears down the profiling tool built in to TensorFlow. See the screenshots below - the main operation is conv2D backprop - a very compute heavy operation. We may get some further performance improvement with reduced precision - see the [`reduced_precision/`](../reduced_precision) folder.
5 | 
6 | 
7 | # Running the TensorFlow Profiler
8 | 
9 | When you've captured your profile data, the profiler will dump it into the folder `logdir` (as above) and you will have to view it. The simplest way, for this application, is to copy it to your own laptop if you have TensorFlow installed. If not, you can run TensorBoard on ThetaGPU and use SSH port forwarding to view it on your own laptop.
10 | 
11 | Whatever you do, you can open TensorBoard like so:
12 | ```
13 | tensorboard --load_fast=false --logdir [your/own/path/to/logdir/]
14 | ```
15 | For my local macOS system with TensorFlow 2.5.0, TensorBoard did not show any data unless the experimental fast loading logic (which theoretically offers 100x-400x shorter loading times) was disabled. See https://github.com/tensorflow/tensorboard/issues/4784
16 | 
17 | Next, open your browser and navigate to `localhost:6006` (or whatever port you forwarded to) and you'll see a screen like the one below:
18 | 
19 | ![TensorBoard Profiler Overview](./images/profiler_overview.png)
20 | 
21 | And, if you scroll down, you'll see the list of the top 10 most time consuming operations:
22 | 
23 | ![top 10](./images/top10_ops.png)
24 | 
25 | This list shows us that the top operations are largely all convolution ops (particularly backprop). The profiler at the top also points out that 0% of the graph is in reduced precision, which could give us a speedup. We'll try that next, but first let's review the other tabs:
26 | 
27 | Here is the Kernel Statistics page:
28 | 
29 | ![kernel stats](./images/kernel-stats.png)
30 | 
31 | Again, this shows that the convolution operations are all the most dominant and (roughly) equally distributed.
32 | 
33 | The TensorFlow statistics shows similar info:
34 | 
35 | ![tf stats](./images/tf-stats.png)
36 | 
37 | And there is also a timeline view of all ops (trace viewer):
38 | 
39 | ![timeline](./images/trace-viewer.png)
40 | 
41 | And zoomed:
42 | 
43 | ![timeline zoom](./images/trace-viewer-zoom.png)
44 | 
45 | Now, let's try running in reduced precision.
46 | 
47 | 
48 | # Using Port Forwarding
49 | 
50 | You can view the processes and how they occupy the compute resources in TensorFlow using TensorBoard.
51 | 
52 | You can login to Theta using:
53 | ```bash
54 | # our proxy port, must be > 1024 and every user needs a different port
55 | export PORT=10001
56 | # login to theta with a port forwarding
57 | ssh -D $PORT user@theta.alcf.anl.gov
58 | # load any conda environment that has a compatible tensorboard installation
59 | module load conda
60 | # add CUDA libraries if you are running on ThetaGPU
61 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lus/theta-fs0/software/thetagpu/cuda/TensorRT-8.5.2.2/lib:/lus/theta-fs0/software/thetagpu/cuda/nccl_2.16.2-1+cuda11.8_x86_64/lib:/lus/theta-fs0/software/thetagpu/cuda/cudnn-linux-x86_64-8.6.0.163_cuda11-archive/lib:/lus/theta-fs0/software/thetagpu/cuda/cuda-11.8.0/extras/CUPTI/lib64:/lus/theta-fs0/software/thetagpu/cuda/cuda-11.8.0/lib64
62 | # start tensorboard (load_fast=false is a recent setting that seems to be needed until TensorFlow works out the bugs)
63 | tensorboard --bind_all --logdir . --load_fast=false
64 | ```
65 | Note the port number that `tensorboard` reports when it starts up.
66 | 
67 | Only 1 user can use a specific port, so if you get an error choose another port number larger than `1024`.
68 | 
69 | Once you have that set up, configure the SOCKS5 proxy of your favorite browser with host localhost and port $PORT (where $PORT is the value you used in the above script, like 10001). Now, in the browser URL bar, enter the login node on which you started tensorboard. For instance, if you are on thetalogin6, you can type in thetalogin6.alcf.anl.gov:6006. Here 6006 is the port that tensorboard uses by default to start up its web service, but it may vary if you customize it.
70 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/kernel-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/kernel-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/profiler_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/profiler_overview.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/tf-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/tf-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/top10_ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/top10_ops.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/trace-viewer-zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/trace-viewer-zoom.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/trace-viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/trace-viewer.png
--------------------------------------------------------------------------------
/03_profileLearning/train_MNIST.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | def init_mpi():
14 |     # Try to initialize Horovod to determine if we're running under MPI:
15 |     try:
16 |         hvd.init()
17 |         return hvd.rank(), hvd.size()
18 |     except:
19 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
20 |             raise Exception("MPI detected in command line but was not able to init!")
21 |         return 0, 1
22 | 
23 | 
24 | def configure_logger(rank):
25 |     '''Configure a global logger
26 | 
27 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
28 | 
29 |     Pass in the MPI rank.
30 | 
31 |     '''
32 |     logger = logging.getLogger()
33 | 
34 |     # Create a handler for STDOUT, but only on the root rank.
35 |     # If not distributed, we still get 0 passed in here.
36 |     if rank == 0:
37 |         stream_handler = logging.StreamHandler()
38 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
39 |         stream_handler.setFormatter(formatter)
40 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
41 |         logger.addHandler(handler)
42 |
43 |         # Add a file handler too:
44 |         log_file = "process.log"
45 |         file_handler = logging.FileHandler(log_file)
46 |         file_handler.setFormatter(formatter)
47 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
48 |         logger.addHandler(file_handler)
49 |
50 |         logger.setLevel(logging.INFO)
51 |     else:
52 |         # in this case, MPI is available but it's not rank 0
53 |         # create a null handler
54 |         handler = logging.NullHandler()
55 |         logger.addHandler(handler)
56 |         logger.setLevel(logging.INFO)
57 |
58 |
59 | class MNISTClassifier(tf.keras.models.Model):
60 |
61 |     def __init__(self, activation=tf.nn.tanh):
62 |         tf.keras.models.Model.__init__(self)
63 |
64 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
65 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
66 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
67 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
68 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
69 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
70 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
71 |
72 |     def call(self, inputs):
73 |         '''
74 |         Forward pass; inputs are expected as [batch, 28, 28, 1]:
75 |         '''
76 |         # batch_size = inputs.shape[0]
77 |
78 |         x = self.conv_1(inputs)
79 |         x = self.conv_2(x)
80 |         x = self.pool_3(x)
81 |         x = self.drop_4(x)
82 |         x = tf.keras.layers.Flatten()(x)
83 |         x = self.dense_5(x)
84 |         x = self.drop_6(x)
85 |         x = self.dense_7(x)
86 |
87 |         return x
88 |
89 |
90 | def compute_loss(y_true, y_pred):
91 |     # if labels are integers, use sparse categorical crossentropy
92 |     # network's final layer is softmax, so from_logits=False
93 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
94 |     # if labels are one-hot encoded, use standard crossentropy
95 |
96 |     return scce(y_true, y_pred)  # .numpy()
97 |
98 |
99 | def get_dataset():
100 |
101 |     # Read in the mnist data so we have it loaded globally:
102 |     (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
103 |     x_train = x_train.astype(numpy.float32)
104 |     x_test = x_test.astype(numpy.float32)
105 |
106 |     x_train /= 255.
107 |     x_test /= 255.
108 |
109 |     y_train = y_train.astype(numpy.int32)
110 |     y_test = y_test.astype(numpy.int32)
111 |
112 |     return x_train, x_test, y_train, y_test
113 |
114 |
115 | def fetch_batch(_batch_size):
116 |     x_train, x_test, y_train, y_test = get_dataset()
117 |     # note: this re-loads the entire MNIST dataset on every call, an I/O cost the profiler will expose
118 |     indexes = numpy.random.choice(a=x_train.shape[0], size=[_batch_size,])
119 |
120 |     images = x_train[indexes].reshape(_batch_size, 28, 28, 1)
121 |     labels = y_train[indexes].reshape(_batch_size, 1)
122 |
123 |     return images, labels
124 |
125 |
126 | # Here is a function that will manage the training loop for us:
127 |
128 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
129 |
130 |     logger = logging.getLogger()
131 |
132 |     # note: hvd.rank() is not called here, so the loop also works when Horovod was not initialized
133 |     for i_epoch in range(n_training_epochs):
134 |
135 |         epoch_steps = int(60000/batch_size)
136 |
137 |         for i_batch in range(epoch_steps):
138 |
139 |             start = time.time()
140 |
141 |             with tf.GradientTape() as tape:
142 |                 batch_data, y_true = fetch_batch(batch_size)
143 |                 y_pred = model(batch_data)
144 |                 loss = compute_loss(y_true, y_pred)
145 |
146 |             if global_size != 1:
147 |                 tape = hvd.DistributedGradientTape(tape)
148 |
149 |             trainable_vars = model.trainable_variables
150 |
151 |             # Apply the update to the network (one at a time):
152 |             grads = tape.gradient(loss, trainable_vars)
153 |
154 |             opt.apply_gradients(zip(grads, trainable_vars))
155 |
156 |             end = time.time()
157 |
158 |             images = batch_size*global_size
159 |
160 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start:.3f}, throughput: {images/(end-start):.3f} img/s.")
161 |
162 |
163 | def train_network(_batch_size, _training_iterations, _lr, global_size):
164 |
165 |     mnist_model = MNISTClassifier()
166 |
167 |     opt = tf.keras.optimizers.Adam(_lr)
168 |
169 |     if global_size != 1:
170 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
171 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
172 |
173 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
174 |
175 |
176 | if __name__ == '__main__':
177 |
178 |     rank, size = init_mpi()
179 |     configure_logger(rank)
180 |
181 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
182 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
183 |                         help='input batch size for training (default: 64)')
184 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
185 |                         help='number of epochs to train (default: 10)')
186 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
187 |                         help='learning rate (default: 0.01)')
188 |     # parser.add_argument('--device', default='cpu',
189 |     #                     help='Whether this is running on cpu or gpu')
190 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
191 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
192 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
193 |     #                     type=int)
194 |
195 |     args = parser.parse_args()
196 |     scaled_lr = args.lr * size  # use the size returned by init_mpi so serial runs (no Horovod) also work
197 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
--------------------------------------------------------------------------------
/04_distributedLearning/ATPESC_2024_AIMLTrack_DDL_Zheng.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/ATPESC_2024_AIMLTrack_DDL_Zheng.pdf
--------------------------------------------------------------------------------
/04_distributedLearning/DeepSpeed/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "train_batch_size": 512,
3 |   "steps_per_print": 2000,
4 |   "optimizer": {
5 |     "type": "Adam",
6 |     "params": {
7 |       "lr": 0.01,
8 |       "betas": [
9 |         0.8,
10 |         0.999
11 |       ],
12 |       "eps": 1e-8,
13 |       "weight_decay": 3e-7
14 |     }
15 |   },
16 |   "scheduler": {
17 |     "type": "WarmupLR",
18 |     "params": {
19 |       "warmup_min_lr": 0,
20 |       "warmup_max_lr": 0.001,
21 |       "warmup_num_steps": 1000
22 |     }
23 |   },
24 |   "gradient_clipping": 1.0,
25 |   "prescale_gradients": false,
26 |   "fp16": {
27 |     "enabled": true,
28 |     "fp16_master_weights_and_grads": false,
29 |     "loss_scale": 0,
30 |     "loss_scale_window": 500,
31 |     "hysteresis": 2,
32 |     "min_loss_scale": 1,
33 |     "initial_scale_power": 15
34 |   },
35 |   "wall_clock_breakdown": false,
36 |   "zero_optimization": {
37 |     "stage": 0,
38 |     "allgather_partitions": true,
39 |     "reduce_scatter": true,
40 |     "allgather_bucket_size": 50000000,
41 |     "reduce_bucket_size": 50000000,
42 |     "overlap_comm": true,
43 |     "contiguous_gradients": true,
44 |     "cpu_offload": false
45 |   }
46 | }
47 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_concise.py:
--------------------------------------------------------------------------------
1 | # Turn off TF logs
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
4 |
5 | import tensorflow as tf
6 |
7 | import numpy
8 | import time
9 |
10 |
11 | import argparse
12 | parser = argparse.ArgumentParser(description='Horovod',
13 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
14 | parser.add_argument('--device', default='gpu',
15 |                     help='Whether this is running on cpu or gpu')
16 | args = parser.parse_args()
17 |
18 |
19 | gpus = tf.config.experimental.list_physical_devices('GPU')
20 | for gpu in gpus:
21 |     tf.config.experimental.set_memory_growth(gpu, True)
22 |
23 |
24 | # MNIST dataset
25 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
26 |
27 | x_train = x_train.astype(numpy.float32)
28 | x_test = x_test.astype(numpy.float32)
29 |
30 | x_train /= 255.
31 | x_test /= 255.
32 |
33 | y_train = y_train.astype(numpy.int32)
34 | y_test = y_test.astype(numpy.int32)
35 |
36 |
37 | # Convolutional model
38 |
39 | class MNISTClassifier(tf.keras.models.Model):
40 |
41 |     def __init__(self, activation=tf.nn.tanh):
42 |         tf.keras.models.Model.__init__(self)
43 |
44 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
45 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
46 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
47 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
48 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
49 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
50 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
51 |
52 |     def call(self, inputs):
53 |
54 |         x = self.conv_1(inputs)
55 |         x = self.conv_2(x)
56 |         x = self.pool_3(x)
57 |         x = self.drop_4(x)
58 |         x = tf.keras.layers.Flatten()(x)
59 |         x = self.dense_5(x)
60 |         x = self.drop_6(x)
61 |         x = self.dense_7(x)
62 |
63 |         return x
64 |
65 |
66 |
67 | def train_network_concise(_batch_size, _n_training_epochs, _lr):
68 |
69 |     cnn_model = MNISTClassifier()
70 |
71 |     cnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(_lr), metrics=['accuracy'])  # build the optimizer with the requested learning rate
72 |
73 |     x_train_reshaped = numpy.expand_dims(x_train, -1)
74 |
75 |     history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs)
76 |     return history, cnn_model
77 |
78 | batch_size = 512
79 | epochs = 20
80 | lr = .01
81 | t0 = time.time()
82 | history, cnn_model = train_network_concise(batch_size, epochs, lr)
83 | t1 = time.time()
84 | print("Total time is %s sec" %(t1-t0))
85 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_concise_hvd.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
3 | os.environ['MPICH_GPU_SUPPORT_ENABLED']='0'
4 | import tensorflow as tf
5 |
6 | #HVD: (1) Initializing Horovod
7 | import horovod.tensorflow.keras as hvd
8 | hvd.init()
9 | print("I am rank %s of %s" %(hvd.rank(), hvd.size()))
10 |
11 | import numpy
12 | import time
13 | import argparse
14 | parser = argparse.ArgumentParser(description='Horovod',
15 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument('--device', default='gpu',
17 |                     help='Whether this is running on cpu or gpu')
18 | parser.add_argument('--epochs', default=50, type=int, help='Number of epochs to run')
19 | parser.add_argument('--warmup_epochs', default=3, type=int, help='Number of warmup epochs')
20 | parser.add_argument('--learning_rate', '--lr', default=0.01, type=float)
21 | parser.add_argument('--batch_size', default=512, type=int)
22 | args = parser.parse_args()
23 |
24 | from tensorflow.python.client import device_lib
25 |
26 | def get_available_devices():
27 |     local_device_protos = device_lib.list_local_devices()
28 |     return [x.name for x in local_device_protos if x.device_type == 'GPU' or x.device_type == 'CPU']
29 |
30 | gpus = tf.config.experimental.list_physical_devices('GPU')
31 | for gpu in gpus:
32 |     tf.config.experimental.set_memory_growth(gpu, True)
33 | #HVD: (2) Pin one GPU to a specific Horovod worker
34 | if gpus:
35 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
36 |
37 | tf.config.threading.set_intra_op_parallelism_threads(0)
38 | tf.config.threading.set_inter_op_parallelism_threads(2)
39 |
40 |
41 |
42 | # MNIST dataset
43 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
44 |
45 | x_train = x_train.astype(numpy.float32)
46 | x_test = x_test.astype(numpy.float32)
47 |
48 | x_train /= 255.
49 | x_test /= 255.
50 |
51 | y_train = y_train.astype(numpy.int32)
52 | y_test = y_test.astype(numpy.int32)
53 |
54 |
55 |
56 |
57 |
58 | # Convolutional model
59 |
60 | class MNISTClassifier(tf.keras.models.Model):
61 |
62 |     def __init__(self, activation=tf.nn.tanh):
63 |         tf.keras.models.Model.__init__(self)
64 |
65 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
66 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
67 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
68 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
69 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
70 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
71 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
72 |
73 |     def call(self, inputs):
74 |
75 |         x = self.conv_1(inputs)
76 |         x = self.conv_2(x)
77 |         x = self.pool_3(x)
78 |         x = self.drop_4(x)
79 |         x = tf.keras.layers.Flatten()(x)
80 |         x = self.dense_5(x)
81 |         x = self.drop_6(x)
82 |         x = self.dense_7(x)
83 |
84 |         return x
85 |
86 |
87 |
88 | def train_network_concise(_batch_size, _n_training_epochs, _lr):
89 |
90 |     cnn_model = MNISTClassifier()
91 |     #HVD: (3) scale the learning rate
92 |     opt = tf.optimizers.Adam(_lr*hvd.size())
93 |     #HVD: (4) add Horovod Distributed Optimizer
94 |     opt = hvd.DistributedOptimizer(opt)
95 |     # Specify `experimental_run_tf_function=False`
96 |     cnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=['accuracy'],
97 |                       experimental_run_tf_function=False)
98 |     #HVD: (5) Define callbacks
99 |     callbacks = [
100 |         # broadcast initial variable states from rank 0
101 |         hvd.callbacks.BroadcastGlobalVariablesCallback(0),
102 |         # Average metric at the end of every epoch
103 |         hvd.callbacks.MetricAverageCallback(),
104 |         # Warmup
105 |         hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=1, initial_lr=_lr*hvd.size()),
106 |     ]
107 |     #HVD: (6) save checkpoints only on worker 0
108 |     if hvd.rank()==0:
109 |         callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
110 |     verbose=0
111 |     if hvd.rank()==0:
112 |         verbose=1
113 |     x_train_reshaped = numpy.expand_dims(x_train, -1)
114 |     #HVD: (7) Adjust the number of steps per epoch
115 |     if (args.device=='cpu'):
116 |         with tf.device('/device:CPU:0'):
117 |             history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs, callbacks=callbacks, steps_per_epoch=60000//hvd.size()//_batch_size, verbose=verbose)
118 |     else:
119 |         history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs, callbacks=callbacks, steps_per_epoch=60000//hvd.size()//_batch_size, verbose=verbose)
120 |     return history, cnn_model
121 |
122 | batch_size = args.batch_size
123 | epochs = args.epochs
124 | lr = args.learning_rate
125 | history, cnn_model = train_network_concise(batch_size, 1, lr)  # 1-epoch warm-up run so one-time initialization cost is excluded from the timing below
126 |
127 | t0 = time.time()
128 | history, cnn_model = train_network_concise(batch_size, epochs-1, lr)
129 | t1 = time.time()
130 | if (hvd.rank()==0):
131 |     print("Hvd Procs %d Total time: %s seconds" %(hvd.size(),t1-t0))
132 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_verbose.py:
--------------------------------------------------------------------------------
1 | # Baseline example without Horovod; compare with 04_keras_cnn_verbose_hvd.py
2 |
3 | import tensorflow as tf
4 |
5 | import numpy
6 | import time
7 |
8 | import argparse
9 | parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
10 | parser.add_argument('--epochs', type=int, default=50, metavar='N',
11 |                     help='number of epochs to train (default: 50)')
12 | parser.add_argument('--device', default='gpu',
13 |                     help='Whether this is running on cpu or gpu')
14 | parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
15 | parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
16 |
17 | args = parser.parse_args()
18 | tf.config.threading.set_intra_op_parallelism_threads(args.num_intra)
19 | tf.config.threading.set_inter_op_parallelism_threads(args.num_inter)
20 | gpus = tf.config.experimental.list_physical_devices('GPU')
21 | for gpu in gpus:
22 |     tf.config.experimental.set_memory_growth(gpu, True)
23 |
24 |
25 | # MNIST dataset
26 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
27 |
28 | x_train = x_train.astype(numpy.float32)
29 | x_test = x_test.astype(numpy.float32)
30 |
31 | x_train /= 255.
32 | x_test /= 255.
33 |
34 | y_train = y_train.astype(numpy.int32)
35 | y_test = y_test.astype(numpy.int32)
36 |
37 |
38 | # Convolutional model
39 |
40 | class MNISTClassifier(tf.keras.models.Model):
41 |
42 |     def __init__(self, activation=tf.nn.tanh):
43 |         tf.keras.models.Model.__init__(self)
44 |
45 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
46 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
47 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
48 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
49 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
50 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
51 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
52 |
53 |     def call(self, inputs):
54 |
55 |         x = self.conv_1(inputs)
56 |         x = self.conv_2(x)
57 |         x = self.pool_3(x)
58 |         x = self.drop_4(x)
59 |         x = tf.keras.layers.Flatten()(x)
60 |         x = self.dense_5(x)
61 |         x = self.drop_6(x)
62 |         x = self.dense_7(x)
63 |
64 |         return x
65 |
66 | def compute_loss(y_true, y_pred):
67 |     # if labels are integers, use sparse categorical crossentropy
68 |     # network's final layer is softmax, so from_logits=False
69 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
70 |     # if labels are one-hot encoded, use standard crossentropy
71 |
72 |     return scce(y_true, y_pred)
73 |
74 |
75 | def forward_pass(model, batch_data, y_true):
76 |     y_pred = model(batch_data)
77 |     loss = compute_loss(y_true, y_pred)
78 |     return loss
79 |
80 |
81 | def train_loop(batch_size, n_training_epochs, model, opt):
82 |
83 |     @tf.function()
84 |     def train_iteration(data, y_true, model, opt):
85 |         with tf.GradientTape() as tape:
86 |             loss = forward_pass(model, data, y_true)
87 |
88 |         trainable_vars = model.trainable_variables
89 |
90 |         # Apply the update to the network (one at a time):
91 |         grads = tape.gradient(loss, trainable_vars)
92 |
93 |         opt.apply_gradients(zip(grads, trainable_vars))
94 |         return loss
95 |
96 |     for i_epoch in range(n_training_epochs):
97 |         print("beginning epoch %d" % i_epoch)
98 |         start = time.time()
99 |
100 |         epoch_steps = int(60000/batch_size)
101 |         # shuffle returns a new dataset (it is not in-place), so chain it into batch():
102 |         batches = dataset.shuffle(60000).batch(batch_size=batch_size, drop_remainder=True)
103 |
104 |         for i_batch, (batch_data, y_true) in enumerate(batches):
105 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
106 |             if (args.device=='cpu'):
107 |                 with tf.device("/cpu:0"):
108 |                     loss = train_iteration(batch_data, y_true, model, opt)
109 |             else:
110 |                 loss = train_iteration(batch_data, y_true, model, opt)
111 |
112 |         end = time.time()
113 |         print("took %1.1f seconds for epoch #%d" % (end-start, i_epoch))
114 |
115 |
116 | def train_network(_batch_size, _n_training_epochs, _lr):
117 |
118 |     mnist_model = MNISTClassifier()
119 |
120 |     opt = tf.keras.optimizers.Adam(_lr)
121 |
122 |     train_loop(_batch_size, _n_training_epochs, mnist_model, opt)
123 |
124 |
125 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
126 | dataset = dataset.shuffle(60000)  # keep the result; shuffle is not in-place
127 |
128 | batch_size = 512
129 | epochs = args.epochs
130 | lr = .01
131 | train_network(batch_size, epochs, lr)
132 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_verbose_hvd.py:
--------------------------------------------------------------------------------
1 | # Horovod example
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
4 | os.environ['MPICH_GPU_SUPPORT_ENABLED']='0'
5 |
6 | import tensorflow as tf
7 |
8 | import numpy
9 | import time
10 |
11 | #HVD: (1) Import horovod
12 | import horovod.tensorflow as hvd
13 | hvd.init()
14 | print("I am rank %d of %d"%(hvd.rank(), hvd.size()))
15 |
16 | import argparse
17 | parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
18 | parser.add_argument('--epochs', default=50,
19 |                     type=int, help='Number of epochs to run')
20 |
21 | parser.add_argument('--device', default='gpu',
22 |                     help='Whether this is running on cpu or gpu')
23 | parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
24 | parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
25 | parser.add_argument('--batch_size', default=512, type=int)
26 | args = parser.parse_args()
27 | tf.config.threading.set_intra_op_parallelism_threads(args.num_intra)
28 | tf.config.threading.set_inter_op_parallelism_threads(args.num_inter)
29 | gpus = tf.config.experimental.list_physical_devices('GPU')
30 | for gpu in gpus:
31 |     tf.config.experimental.set_memory_growth(gpu, True)
32 | #HVD: (2) Pin GPU
33 | if gpus:
34 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
35 |
36 | # MNIST dataset
37 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
38 |
39 | x_train = x_train.astype(numpy.float32)
40 | x_test = x_test.astype(numpy.float32)
41 |
42 | x_train /= 255.
43 | x_test /= 255.
44 |
45 | y_train = y_train.astype(numpy.int32)
46 | y_test = y_test.astype(numpy.int32)
47 |
48 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
49 | #HVD: shard the dataset across workers
50 | dataset = dataset.shard(hvd.size(), hvd.rank())
51 | dataset = dataset.shuffle(60000)
52 | batches = dataset.batch(batch_size = args.batch_size, drop_remainder=True)
53 |
54 | # Convolutional model
55 |
56 | class MNISTClassifier(tf.keras.models.Model):
57 |
58 |     def __init__(self, activation=tf.nn.tanh):
59 |         tf.keras.models.Model.__init__(self)
60 |
61 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
62 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
63 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
64 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
65 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
66 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
67 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
68 |
69 |     def call(self, inputs):
70 |
71 |         x = self.conv_1(inputs)
72 |         x = self.conv_2(x)
73 |         x = self.pool_3(x)
74 |         x = self.drop_4(x)
75 |         x = tf.keras.layers.Flatten()(x)
76 |         x = self.dense_5(x)
77 |         x = self.drop_6(x)
78 |         x = self.dense_7(x)
79 |
80 |         return x
81 |
82 | def compute_loss(y_true, y_pred):
83 |     # if labels are integers, use sparse categorical crossentropy
84 |     # network's final layer is softmax, so from_logits=False
85 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
86 |     # if labels are one-hot encoded, use standard crossentropy
87 |
88 |     return scce(y_true, y_pred)
89 |
90 |
91 | def forward_pass(model, batch_data, y_true):
92 |     y_pred = model(batch_data)
93 |     loss = compute_loss(y_true, y_pred)
94 |     return loss
95 |
96 |
97 | def train_loop(batch_size, n_training_epochs, model, opt):
98 |
99 |     @tf.function()
100 |     def train_iteration(data, y_true, model, opt):
101 |         with tf.GradientTape() as tape:
102 |             loss = forward_pass(model, data, y_true)
103 |
104 |         trainable_vars = model.trainable_variables
105 |         #HVD: (4) distributed tape
106 |         tape = hvd.DistributedGradientTape(tape)
107 |
108 |         # Apply the update to the network (one at a time):
109 |         grads = tape.gradient(loss, trainable_vars)
110 |
111 |         opt.apply_gradients(zip(grads, trainable_vars))
112 |         return loss
113 |
114 |     for i_epoch in range(n_training_epochs):
115 |         if (hvd.rank()==0):
116 |             print("beginning epoch %d" % i_epoch)
117 |         start = time.time()
118 |         total_loss = 0.0
119 |         for i_batch, (batch_data, y_true) in enumerate(batches):
120 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
121 |             if (args.device=='cpu'):
122 |                 with tf.device("/cpu:0"):
123 |                     loss = train_iteration(batch_data, y_true, model, opt)
124 |             else:
125 |                 loss = train_iteration(batch_data, y_true, model, opt)
126 |             total_loss += loss
127 |             #HVD: (5) broadcast from rank 0 (must happen after the first step to ensure the optimizer is initialized)
128 |             if (i_batch==0 and i_epoch==0):
129 |                 hvd.broadcast_variables(model.variables, root_rank=0)
130 |                 hvd.broadcast_variables(opt.variables(), root_rank=0)
131 |         #HVD: (6) sum the loss across workers (average=False sums rather than averages)
132 |         total_loss = hvd.allreduce(total_loss, average=False)
133 |         end = time.time()
134 |         if (hvd.rank()==0):
135 |             print("took %4.4f seconds for epoch #%d - loss: %1.4f" % (end-start, i_epoch, total_loss))
136 |
137 |
138 | def train_network(_batch_size, _n_training_epochs, _lr):
139 |
140 |     mnist_model = MNISTClassifier()
141 |     #HVD: (3) scale learning rate
142 |     opt =
tf.keras.optimizers.Adam(_lr*hvd.size()) 143 | 144 | train_loop(_batch_size, _n_training_epochs, mnist_model, opt) 145 | 146 | 147 | batch_size = 512 148 | epochs = args.epochs 149 | lr = .01 150 | train_network(batch_size, epochs, lr) 151 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/#mpi_profile.2500161.0#: -------------------------------------------------------------------------------- 1 | 2 | Data for MPI rank 0 of 8: 3 | Times and statistics from MPI_Init() to MPI_Finalize(). 4 | ----------------------------------------------------------------------- 5 | MPI Routine #calls avg. bytes time(sec) 6 | ----------------------------------------------------------------------- 7 | MPI_Comm_rank 5 0.0 0.000 8 | MPI_Comm_size 3 0.0 0.000 9 | MPI_Bcast 150 192058.6 0.021 10 | MPI_Allreduce 77340 44324.1 106.664 11 | MPI_Gather 50 4.0 0.003 12 | MPI_Gatherv 50 0.0 0.001 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | total communication time = 106.689 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2652.547 seconds. 18 | system time = 332.597 seconds. 19 | max resident set size = 1901.574 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.000 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.017 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 86.606 45 | 16 40.0 0.003 46 | 709 128.0 0.093 47 | 709 256.0 0.149 48 | 10 512.0 0.002 49 | 76 552.0 0.075 50 | 713 1154.2 0.069 51 | 684 5607.9 0.645 52 | 714 73730.2 0.176 53 | 714 4718824.9 18.844 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.003 57 | 58 | MPI_Allgather #calls avg. bytes time(sec) 59 | 2 4.0 0.000 60 | 61 | ----------------------------------------------------------------- 62 | 63 | Communication summary for all tasks: 64 | 65 | minimum communication time = 105.854 sec for task 4 66 | median communication time = 107.650 sec for task 1 67 | maximum communication time = 109.122 sec for task 5 68 | 69 | 70 | MPI timing summary for all ranks: 71 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 72 | 0 thetagpu23 106.69 128.90 2652.55 332.60 1901.57 1791292 73 | 1 thetagpu23 107.65 128.90 2633.87 266.90 1879.50 1793841 74 | 2 thetagpu23 108.69 128.90 2683.13 236.28 1928.97 1795163 75 | 3 thetagpu23 109.09 128.90 2645.00 239.66 1892.77 1805047 76 | 4 thetagpu23 105.85 128.90 2616.02 262.91 1864.30 1785421 77 | 5 thetagpu23 109.12 128.90 2625.73 271.42 1870.32 1796652 78 | 6 thetagpu23 107.05 128.90 2619.91 294.70 1865.26 1810114 79 | 7 thetagpu23 107.69 128.90 2652.56 236.55 1961.68 1795444 80 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.0: -------------------------------------------------------------------------------- 1 | Data for MPI rank 0 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 
3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.021 9 | MPI_Allreduce 77340 44324.1 106.664 10 | MPI_Gather 50 4.0 0.003 11 | MPI_Gatherv 50 0.0 0.001 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | total communication time = 106.689 seconds. 15 | total elapsed time = 128.903 seconds. 16 | user cpu time = 2652.547 seconds. 17 | system time = 332.597 seconds. 18 | max resident set size = 1901.574 MBytes. 19 | 20 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 21 | Rank 0 reported the largest elapsed time : 128.90 sec 22 | 23 | ----------------------------------------------------------------- 24 | Message size distributions: 25 | 26 | MPI_Bcast #calls avg. bytes time(sec) 27 | 50 4.0 0.000 28 | 2 8.0 0.000 29 | 39 25.0 0.000 30 | 6 40.0 0.000 31 | 6 128.0 0.000 32 | 7 245.9 0.000 33 | 11 406.3 0.000 34 | 1 553.0 0.000 35 | 8 1193.2 0.000 36 | 2 2829.0 0.000 37 | 6 5120.0 0.000 38 | 6 73728.0 0.003 39 | 6 4718592.0 0.017 40 | 41 | MPI_Allreduce #calls avg. bytes time(sec) 42 | 102 4.0 0.001 43 | 72893 16.0 86.606 44 | 16 40.0 0.003 45 | 709 128.0 0.093 46 | 709 256.0 0.149 47 | 10 512.0 0.002 48 | 76 552.0 0.075 49 | 713 1154.2 0.069 50 | 684 5607.9 0.645 51 | 714 73730.2 0.176 52 | 714 4718824.9 18.844 53 | 54 | MPI_Gather #calls avg. bytes time(sec) 55 | 50 4.0 0.003 56 | 57 | MPI_Allgather #calls avg. bytes time(sec) 58 | 2 4.0 0.000 59 | 60 | ----------------------------------------------------------------- 61 | 62 | Communication summary for all tasks: 63 | 64 | minimum communication time = 105.854 sec for task 4 65 | median communication time = 107.650 sec for task 1 66 | maximum communication time = 109.122 sec for task 5 67 | 68 | 69 | MPI timing summary for all ranks: 70 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 71 | 0 thetagpu23 106.69 128.90 2652.55 332.60 1901.57 1791292 72 | 1 thetagpu23 107.65 128.90 2633.87 266.90 1879.50 1793841 73 | 2 thetagpu23 108.69 128.90 2683.13 236.28 1928.97 1795163 74 | 3 thetagpu23 109.09 128.90 2645.00 239.66 1892.77 1805047 75 | 4 thetagpu23 105.85 128.90 2616.02 262.91 1864.30 1785421 76 | 5 thetagpu23 109.12 128.90 2625.73 271.42 1870.32 1796652 77 | 6 thetagpu23 107.05 128.90 2619.91 294.70 1865.26 1810114 78 | 7 thetagpu23 107.69 128.90 2652.56 236.55 1961.68 1795444 79 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.1: -------------------------------------------------------------------------------- 1 | Data for MPI rank 1 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.027 9 | MPI_Allreduce 77340 44324.1 107.623 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 187.9 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 1 of 8 had the median communication time. 15 | total communication time = 107.650 seconds. 
16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2633.868 seconds. 18 | system time = 266.897 seconds. 19 | max resident set size = 1879.500 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.018 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 87.583 45 | 16 40.0 0.003 46 | 709 128.0 0.099 47 | 709 256.0 0.181 48 | 10 512.0 0.004 49 | 76 552.0 0.063 50 | 713 1154.2 0.072 51 | 684 5607.9 0.591 52 | 714 73730.2 0.178 53 | 714 4718824.9 18.849 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 39 25.0 0.000 60 | 1 121.0 0.000 61 | 4 239.0 0.000 62 | 1 257.0 0.000 63 | 1 601.0 0.000 64 | 2 1033.0 0.000 65 | 2 2209.0 0.000 66 | 67 | MPI_Allgather #calls avg. bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.4: -------------------------------------------------------------------------------- 1 | Data for MPI rank 4 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.026 9 | MPI_Allreduce 77340 44324.1 105.827 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 188.2 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 4 of 8 had the minimum communication time. 15 | total communication time = 105.854 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2616.017 seconds. 18 | system time = 262.911 seconds. 19 | max resident set size = 1864.305 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.002 40 | 6 4718592.0 0.019 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 85.829 45 | 16 40.0 0.003 46 | 709 128.0 0.099 47 | 709 256.0 0.092 48 | 10 512.0 0.004 49 | 76 552.0 0.075 50 | 713 1154.2 0.069 51 | 684 5607.9 0.610 52 | 714 73730.2 0.178 53 | 714 4718824.9 18.866 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 39 25.0 0.000 60 | 1 121.0 0.000 61 | 4 239.0 0.000 62 | 1 257.0 0.000 63 | 1 601.0 0.000 64 | 2 1041.0 0.000 65 | 2 2209.0 0.000 66 | 67 | MPI_Allgather #calls avg. 
bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.5: -------------------------------------------------------------------------------- 1 | Data for MPI rank 5 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.027 9 | MPI_Allreduce 77340 44324.1 109.094 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 187.4 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 5 of 8 had the maximum communication time. 15 | total communication time = 109.122 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2625.732 seconds. 18 | system time = 271.419 seconds. 19 | max resident set size = 1870.316 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.002 40 | 6 4718592.0 0.019 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 89.051 45 | 16 40.0 0.003 46 | 709 128.0 0.096 47 | 709 256.0 0.182 48 | 10 512.0 0.004 49 | 76 552.0 0.056 50 | 713 1154.2 0.068 51 | 684 5607.9 0.627 52 | 714 73730.2 0.176 53 | 714 4718824.9 18.831 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 40 25.0 0.000 60 | 1 121.0 0.000 61 | 2 237.0 0.000 62 | 2 345.0 0.000 63 | 2 809.0 0.000 64 | 1 1041.0 0.000 65 | 2 2213.0 0.000 66 | 67 | MPI_Allgather #calls avg. bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.0: -------------------------------------------------------------------------------- 1 | Data for MPI rank 0 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.019 9 | MPI_Barrier 1 0.0 0.081 10 | MPI_Allreduce 12506 15.9 5.887 11 | MPI_Gather 30 4.0 0.006 12 | MPI_Gatherv 30 0.0 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | total communication time = 5.994 seconds. 16 | total elapsed time = 18.068 seconds. 17 | user cpu time = 19.887 seconds. 18 | system time = 16.860 seconds. 19 | max resident set size = 4690.930 MBytes. 
20 | 21 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 22 | Rank 5 reported the largest elapsed time : 18.19 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 30 4.0 0.000 29 | 2 8.0 0.000 30 | 20 25.0 0.000 31 | 6 40.0 0.000 32 | 7 128.0 0.000 33 | 8 241.2 0.000 34 | 8 476.2 0.000 35 | 3 793.0 0.000 36 | 7 1149.9 0.000 37 | 2 2665.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.015 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 12404 16.0 5.885 45 | 46 | MPI_Gather #calls avg. bytes time(sec) 47 | 30 4.0 0.006 48 | 49 | MPI_Allgather #calls avg. bytes time(sec) 50 | 2 4.0 0.000 51 | 52 | ----------------------------------------------------------------- 53 | 54 | Communication summary for all tasks: 55 | 56 | minimum communication time = 3.775 sec for task 5 57 | median communication time = 5.787 sec for task 6 58 | maximum communication time = 6.732 sec for task 2 59 | 60 | 61 | MPI timing summary for all ranks: 62 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 63 | 0 thetagpu23 5.99 18.07 19.89 16.86 4690.93 102621 64 | 1 thetagpu23 6.57 18.19 16.87 2.96 4687.09 102385 65 | 2 thetagpu23 6.73 18.19 16.84 3.12 4738.88 100802 66 | 3 thetagpu23 6.59 18.19 16.60 3.16 4704.07 101685 67 | 4 thetagpu23 4.68 18.19 14.43 3.34 4692.63 102432 68 | 5 thetagpu23 3.77 18.19 13.95 3.24 4679.32 100831 69 | 6 thetagpu23 5.79 18.19 16.15 2.98 4687.98 102823 70 | 7 thetagpu23 4.49 18.07 14.63 3.16 4684.28 102118 71 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.2: -------------------------------------------------------------------------------- 1 | Data for MPI rank 2 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.126 9 | MPI_Barrier 1 0.0 0.001 10 | MPI_Allreduce 12506 15.9 6.606 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 295.4 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 2 of 8 had the maximum communication time. 16 | total communication time = 6.732 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 16.840 seconds. 19 | system time = 3.117 seconds. 20 | max resident set size = 4738.883 MBytes. 21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.005 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.016 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 6.604 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. 
bytes time(sec) 51 | 20 25.0 0.000 52 | 1 121.0 0.000 53 | 1 161.0 0.000 54 | 2 357.0 0.000 55 | 3 723.7 0.000 56 | 2 1509.0 0.000 57 | 1 2177.0 0.000 58 | 59 | MPI_Allgather #calls avg. bytes time(sec) 60 | 2 4.0 0.000 61 | 62 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.5: -------------------------------------------------------------------------------- 1 | Data for MPI rank 5 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.128 9 | MPI_Barrier 1 0.0 0.000 10 | MPI_Allreduce 12506 15.9 3.645 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 295.7 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 5 of 8 had the minimum communication time. 16 | total communication time = 3.775 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 13.953 seconds. 19 | system time = 3.244 seconds. 20 | max resident set size = 4679.320 MBytes. 21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.007 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.017 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 3.644 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. bytes time(sec) 51 | 20 25.0 0.000 52 | 1 121.0 0.000 53 | 3 339.7 0.000 54 | 3 798.3 0.000 55 | 3 1611.7 0.000 56 | 57 | MPI_Allgather #calls avg. bytes time(sec) 58 | 2 4.0 0.000 59 | 60 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.6: -------------------------------------------------------------------------------- 1 | Data for MPI rank 6 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.127 9 | MPI_Barrier 1 0.0 0.001 10 | MPI_Allreduce 12506 15.9 5.658 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 293.8 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 6 of 8 had the median communication time. 16 | total communication time = 5.787 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 16.145 seconds. 19 | system time = 2.975 seconds. 20 | max resident set size = 4687.980 MBytes. 
21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.005 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.017 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 5.657 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. bytes time(sec) 51 | 21 25.0 0.000 52 | 1 121.0 0.000 53 | 3 369.0 0.000 54 | 2 805.0 0.000 55 | 1 1049.0 0.000 56 | 2 2201.0 0.000 57 | 58 | MPI_Allgather #calls avg. bytes time(sec) 59 | 2 4.0 0.000 60 | 61 | -------------------------------------------------------------------------------- /04_distributedLearning/figures/Horovod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/Horovod.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/cpu_horovodtimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/cpu_horovodtimeline.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/distributed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/distributed.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/gpu_horovodtimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/gpu_horovodtimeline.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/resnet50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/resnet50.png -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_1.out: -------------------------------------------------------------------------------- 1 | I am rank 0 of 1 2 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 3 | warnings.warn( 4 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0036s vs `on_train_batch_end` time: 0.0357s). Check your callbacks. 
5 | 117/117 - 7s - loss: 0.3079 - accuracy: 0.9042 6 | Epoch 1/20 7 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0036s vs `on_train_batch_end` time: 0.0189s). Check your callbacks. 8 | 117/117 - 1s - loss: 0.2835 - accuracy: 0.9120 9 | Epoch 2/20 10 | 117/117 - 1s - loss: 0.0888 - accuracy: 0.9743 11 | Epoch 3/20 12 | 117/117 - 1s - loss: 0.0683 - accuracy: 0.9793 13 | 14 | Epoch 3: finished gradual learning rate warmup to 0.01. 15 | Epoch 4/20 16 | 117/117 - 1s - loss: 0.0656 - accuracy: 0.9799 17 | Epoch 5/20 18 | 117/117 - 1s - loss: 0.0513 - accuracy: 0.9845 19 | Epoch 6/20 20 | 117/117 - 1s - loss: 0.0520 - accuracy: 0.9839 21 | Epoch 7/20 22 | 117/117 - 1s - loss: 0.0477 - accuracy: 0.9848 23 | Epoch 8/20 24 | 117/117 - 1s - loss: 0.0423 - accuracy: 0.9864 25 | Epoch 9/20 26 | 117/117 - 1s - loss: 0.0409 - accuracy: 0.9870 27 | Epoch 10/20 28 | 117/117 - 1s - loss: 0.0389 - accuracy: 0.9879 29 | Epoch 11/20 30 | 117/117 - 1s - loss: 0.0381 - accuracy: 0.9880 31 | Epoch 12/20 32 | 117/117 - 1s - loss: 0.0406 - accuracy: 0.9874 33 | Epoch 13/20 34 | 117/117 - 1s - loss: 0.0416 - accuracy: 0.9872 35 | Epoch 14/20 36 | 117/117 - 1s - loss: 0.0351 - accuracy: 0.9892 37 | Epoch 15/20 38 | 117/117 - 1s - loss: 0.0356 - accuracy: 0.9887 39 | Epoch 16/20 40 | 117/117 - 1s - loss: 0.0398 - accuracy: 0.9875 41 | Epoch 17/20 42 | 117/117 - 1s - loss: 0.0395 - accuracy: 0.9881 43 | Epoch 18/20 44 | 117/117 - 1s - loss: 0.0371 - accuracy: 0.9884 45 | Epoch 19/20 46 | 117/117 - 1s - loss: 0.0361 - accuracy: 0.9888 47 | Epoch 20/20 48 | 117/117 - 1s - loss: 0.0411 - accuracy: 0.9876 49 | Total time: 13.148040294647217 second 50 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_2.out: -------------------------------------------------------------------------------- 1 | I am rank 1 of 2 2 | I am rank 0 of 2 3 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 4 | warnings.warn( 5 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 6 | warnings.warn( 7 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0071s vs `on_train_batch_end` time: 0.0387s). Check your callbacks. 8 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0069s vs `on_train_batch_end` time: 0.0388s). Check your callbacks. 9 | 58/58 - 8s - loss: 0.3714 - accuracy: 0.8798 10 | Epoch 1/20 11 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0178s). Check your callbacks. 12 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0058s vs `on_train_batch_end` time: 0.0180s). Check your callbacks. 13 | 58/58 - 1s - loss: 0.4186 - accuracy: 0.8662 14 | Epoch 2/20 15 | 58/58 - 0s - loss: 0.0983 - accuracy: 0.9711 16 | Epoch 3/20 17 | 58/58 - 0s - loss: 0.0706 - accuracy: 0.9788 18 | 19 | Epoch 3: finished gradual learning rate warmup to 0.01. 
20 | Epoch 4/20 21 | 58/58 - 0s - loss: 0.0641 - accuracy: 0.9796 22 | Epoch 5/20 23 | 58/58 - 0s - loss: 0.0584 - accuracy: 0.9826 24 | Epoch 6/20 25 | 58/58 - 0s - loss: 0.0485 - accuracy: 0.9840 26 | Epoch 7/20 27 | 58/58 - 0s - loss: 0.0398 - accuracy: 0.9872 28 | Epoch 8/20 29 | 58/58 - 0s - loss: 0.0426 - accuracy: 0.9869 30 | Epoch 9/20 31 | 58/58 - 0s - loss: 0.0432 - accuracy: 0.9863 32 | Epoch 10/20 33 | 58/58 - 0s - loss: 0.0350 - accuracy: 0.9885 34 | Epoch 11/20 35 | 58/58 - 0s - loss: 0.0322 - accuracy: 0.9898 36 | Epoch 12/20 37 | 58/58 - 0s - loss: 0.0269 - accuracy: 0.9908 38 | Epoch 13/20 39 | 58/58 - 0s - loss: 0.0317 - accuracy: 0.9898 40 | Epoch 14/20 41 | 58/58 - 0s - loss: 0.0285 - accuracy: 0.9905 42 | Epoch 15/20 43 | 58/58 - 0s - loss: 0.0276 - accuracy: 0.9908 44 | Epoch 16/20 45 | 58/58 - 0s - loss: 0.0280 - accuracy: 0.9904 46 | Epoch 17/20 47 | 58/58 - 0s - loss: 0.0247 - accuracy: 0.9921 48 | Epoch 18/20 49 | 58/58 - 0s - loss: 0.0276 - accuracy: 0.9911 50 | Epoch 19/20 51 | 58/58 - 0s - loss: 0.0266 - accuracy: 0.9914 52 | Epoch 20/20 53 | 58/58 - 0s - loss: 0.0234 - accuracy: 0.9922 54 | Total time: 8.65635347366333 second 55 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_4.out: -------------------------------------------------------------------------------- 1 | I am rank 0 of 4 2 | I am rank 1 of 4 3 | I am rank 2 of 4 4 | I am rank 3 of 4 5 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 6 | warnings.warn( 7 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 8 | warnings.warn( 9 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 10 | warnings.warn( 11 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 12 | warnings.warn( 13 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0286s). Check your callbacks. 14 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0053s vs `on_train_batch_end` time: 0.0286s). Check your callbacks. 15 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0285s). Check your callbacks. 16 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0285s). Check your callbacks. 17 | 29/29 - 7s - loss: 0.6509 - accuracy: 0.7879 18 | Epoch 1/20 19 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 
20 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 21 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 22 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 23 | 29/29 - 1s - loss: 0.6974 - accuracy: 0.7748 24 | Epoch 2/20 25 | 29/29 - 0s - loss: 0.1578 - accuracy: 0.9548 26 | Epoch 3/20 27 | 29/29 - 0s - loss: 0.0885 - accuracy: 0.9740 28 | 29 | Epoch 3: finished gradual learning rate warmup to 0.01. 30 | Epoch 4/20 31 | 29/29 - 0s - loss: 0.0667 - accuracy: 0.9806 32 | Epoch 5/20 33 | 29/29 - 0s - loss: 0.0674 - accuracy: 0.9795 34 | Epoch 6/20 35 | 29/29 - 0s - loss: 0.0519 - accuracy: 0.9839 36 | Epoch 7/20 37 | 29/29 - 0s - loss: 0.0482 - accuracy: 0.9857 38 | Epoch 8/20 39 | 29/29 - 0s - loss: 0.0408 - accuracy: 0.9874 40 | Epoch 9/20 41 | 29/29 - 0s - loss: 0.0349 - accuracy: 0.9899 42 | Epoch 10/20 43 | 29/29 - 0s - loss: 0.0336 - accuracy: 0.9883 44 | Epoch 11/20 45 | 29/29 - 0s - loss: 0.0308 - accuracy: 0.9902 46 | Epoch 12/20 47 | 29/29 - 0s - loss: 0.0275 - accuracy: 0.9911 48 | Epoch 13/20 49 | 29/29 - 0s - loss: 0.0234 - accuracy: 0.9927 50 | Epoch 14/20 51 | 29/29 - 0s - loss: 0.0218 - accuracy: 0.9925 52 | Epoch 15/20 53 | 29/29 - 0s - loss: 0.0255 - accuracy: 0.9917 54 | Epoch 16/20 55 | 29/29 - 0s - loss: 0.0284 - accuracy: 0.9906 56 | Epoch 17/20 57 | 29/29 - 0s - loss: 0.0203 - accuracy: 0.9935 58 | Epoch 18/20 59 | 29/29 - 0s - loss: 0.0240 - accuracy: 0.9923 60 | Epoch 19/20 61 | 29/29 - 0s - loss: 0.0189 - accuracy: 0.9935 62 | Epoch 20/20 63 | 29/29 - 0s - loss: 0.0203 - accuracy: 0.9924 64 | Total time: 3.7076730728149414 second 65 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_8.out: -------------------------------------------------------------------------------- 1 | I am rank 4 of 8 2 | I am rank 0 of 8 3 | I am rank 1 of 8 4 | I am rank 2 of 8 5 | I am rank 3 of 8 6 | I am rank 5 of 8 7 | I am rank 6 of 8 8 | I am rank 7 of 8 9 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 10 | warnings.warn( 11 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 12 | warnings.warn( 13 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 14 | warnings.warn( 15 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 16 | warnings.warn( 17 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some 
callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 18 | warnings.warn( 19 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 20 | warnings.warn( 21 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 22 | warnings.warn( 23 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 24 | warnings.warn( 25 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 26 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0059s vs `on_train_batch_end` time: 0.0309s). Check your callbacks. 27 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0059s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 28 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0062s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 29 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0307s). Check your callbacks. 30 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 31 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 32 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 33 | 14/14 - 11s - loss: 1.0856 - accuracy: 0.6493 34 | Epoch 1/20 35 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 36 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0056s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 37 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 38 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0056s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 39 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 40 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 
41 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 42 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 43 | 14/14 - 1s - loss: 1.1651 - accuracy: 0.6348 44 | Epoch 2/20 45 | 14/14 - 0s - loss: 0.3157 - accuracy: 0.9061 46 | Epoch 3/20 47 | 14/14 - 0s - loss: 0.1135 - accuracy: 0.9653 48 | 49 | Epoch 3: finished gradual learning rate warmup to 0.01. 50 | Epoch 4/20 51 | 14/14 - 0s - loss: 0.1007 - accuracy: 0.9700 52 | Epoch 5/20 53 | 14/14 - 0s - loss: 0.0634 - accuracy: 0.9817 54 | Epoch 6/20 55 | 14/14 - 0s - loss: 0.0586 - accuracy: 0.9802 56 | Epoch 7/20 57 | 14/14 - 0s - loss: 0.0530 - accuracy: 0.9859 58 | Epoch 8/20 59 | 14/14 - 0s - loss: 0.0452 - accuracy: 0.9859 60 | Epoch 9/20 61 | 14/14 - 0s - loss: 0.0441 - accuracy: 0.9847 62 | Epoch 10/20 63 | 14/14 - 0s - loss: 0.0330 - accuracy: 0.9891 64 | Epoch 11/20 65 | 14/14 - 0s - loss: 0.0326 - accuracy: 0.9886 66 | Epoch 12/20 67 | 14/14 - 0s - loss: 0.0316 - accuracy: 0.9895 68 | Epoch 13/20 69 | 14/14 - 0s - loss: 0.0284 - accuracy: 0.9914 70 | Epoch 14/20 71 | 14/14 - 0s - loss: 0.0311 - accuracy: 0.9894 72 | Epoch 15/20 73 | 14/14 - 0s - loss: 0.0257 - accuracy: 0.9919 74 | Epoch 16/20 75 | 14/14 - 0s - loss: 0.0306 - accuracy: 0.9901 76 | Epoch 17/20 77 | 14/14 - 0s - loss: 0.0290 - accuracy: 0.9899 78 | Epoch 18/20 79 | 14/14 - 0s - loss: 0.0247 - accuracy: 0.9923 80 | Epoch 19/20 81 | 14/14 - 0s - loss: 0.0271 - accuracy: 0.9914 82 | Epoch 20/20 83 | 14/14 - 0s - loss: 0.0222 - accuracy: 0.9926 84 | Total time: 2.2868692874908447 second 85 | -------------------------------------------------------------------------------- /04_distributedLearning/submissions/qsub_polaris.sc: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #PBS -l walltime=00:30:00 3 | #PBS -l nodes=4:ppn=4 4 | #PBS -N atpesc_horovod 5 | #PBS -k doe 6 | #PBS -j oe 7 | #PBS -A ATPESC_2024 8 | 9 | module use /soft/modulefiles 10 | module load conda 11 | conda activate 12 | NODES=$(sort ${PBS_NODEFILE} | uniq -c | sort -n | wc -l) 13 | GPUS_PER_NODE=4 14 | RANKS=$((NODES * GPUS_PER_NODE)) 15 | echo NODES=$NODES PPN=$GPUS_PER_NODE RANKS=$RANKS 16 | 17 | aprun -n 1 -N 1 python Horovod/04_keras_cnn_concise_hvd.py 18 | aprun -n 2 -N 2 python Horovod/04_keras_cnn_concise_hvd.py 19 | aprun -n 4 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 20 | aprun -n 8 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 21 | aprun -n 16 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 22 | -------------------------------------------------------------------------------- /04_distributedLearning/submissions/qsub_thetagpu.sc: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #COBALT -n 1 3 | #COBALT -t 0:15:00 4 | #COBALT -q training-gpu 5 | #COBALT -A ATPESC2023 6 | 7 | 8 | # source /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/setup.sh 9 | 10 | module load conda/2022-07-01; conda activate 11 | export http_proxy=http://theta-proxy.tmi.alcf.anl.gov:3128 12 | export https_proxy=https://theta-proxy.tmi.alcf.anl.gov:3128 13 | 14 | mpirun -np 1 python Horovod/04_keras_cnn_concise_hvd.py >& concise_1.out.gpu 15 | mpirun -np 2 python Horovod/04_keras_cnn_concise_hvd.py >& concise_2.out.gpu 16 | mpirun -np 4 python 
Horovod/04_keras_cnn_concise_hvd.py >& concise_4.out.gpu 17 | mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py >& concise_8.out.gpu 18 | 19 | 20 | HOROVOD_TIMELINE=gpu.json LD_PRELOAD=/soft/perftools/hpctw/lib/libmpitrace.so mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py 21 | HOROVOD_TIMELINE=cpu.json LD_PRELOAD=/soft/perftools/hpctw/lib/libmpitrace.so mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py --device cpu 22 | 23 | -------------------------------------------------------------------------------- /05_aiTestbed/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/.DS_Store -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/Cerebras_Wafer-Scale_Cluster_login_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Cerebras/Cerebras_Wafer-Scale_Cluster_login_diagram.png -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/README.md: -------------------------------------------------------------------------------- 1 | # Cerebras 2 | 3 | ## Connection to a CS-2 node 4 | 5 | Connection to one of the CS-2 cluster login nodes requires an MFA passcode for authentication - either an 8-digit passcode generated by an app on your mobile device (e.g. MobilePASS+) or a CRYPTOCard-generated passcode prefixed by a 4-digit PIN. 6 | 7 | ![CS-2 connection diagram](./Cerebras_Wafer-Scale_Cluster_login_diagram.png) 8 | 9 | To connect, ssh to one of the CS-2 login nodes: 10 | ```bash 11 | ssh ALCFUserID@cerebras.ai.alcf.anl.gov 12 | ``` 13 | 14 | ## Create Virtual Environment 15 | 16 | ### PyTorch virtual environment 17 | Create a PyTorch virtual environment for Cerebras: 18 | ```bash 19 | mkdir ~/R_2.3.0 20 | cd ~/R_2.3.0 21 | # Note: "deactivate" does not actually work in scripts. 22 | deactivate 23 | rm -r venv_cerebras_pt 24 | /software/cerebras/python3.8/bin/python3.8 -m venv venv_cerebras_pt 25 | source venv_cerebras_pt/bin/activate 26 | pip install --upgrade pip 27 | pip install cerebras_pytorch==2.3.0 28 | ``` 29 | ## Clone Cerebras modelzoo 30 | 31 | We use an example from the [Cerebras Modelzoo repository](https://github.com/Cerebras/modelzoo) for this hands-on. 32 | Clone the modelzoo repository.
33 | 34 | ```bash 35 | mkdir ~/R_2.3.0 36 | cd ~/R_2.3.0 37 | git clone https://github.com/Cerebras/modelzoo.git 38 | cd modelzoo 39 | git tag 40 | git checkout Release_2.3.0 41 | ``` 42 | 43 | ## Job Queuing and Submission 44 | 45 | The CS-2 cluster has its own Kubernetes-based system for job submission and queuing. Jobs are started automatically through the Python scripts. 46 | 47 | Use the Cerebras cluster command-line tool to get additional information about the jobs. 48 | 49 | * Jobs that have not yet completed can be listed with: 50 | `(venv_pt) $ csctl get jobs` 51 | * Jobs can be canceled as shown: 52 | `(venv_tf) $ csctl cancel job wsjob-eyjapwgnycahq9tus4w7id` 53 | 54 | See `csctl -h` for more options. 55 | 56 | ## Run Examples 57 | 58 | Refer to the instructions in the file below to run the GPT-J model. 59 | 61 | * [GPT-J](./gpt-j.md) 62 | 63 | # Useful Resources 64 | 65 | * [ALCF Cerebras Documentation](https://docs.alcf.anl.gov/ai-testbed/cerebras/system-overview/) 66 | * [Cerebras Documentation](https://docs.cerebras.net/en/latest/wsc/index.html) 67 | * [Cerebras Modelzoo Repo](https://github.com/Cerebras/modelzoo/tree/main/modelzoo) 68 | * Datasets Path: `/software/cerebras/dataset` 69 | -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/gpt-j.md: -------------------------------------------------------------------------------- 1 | # GPT-J on Cerebras 2 | 3 | Go to the directory with the GPT-J example. 4 | ```bash 5 | cd ~/R_2.3.0/modelzoo/src/cerebras/modelzoo/models/nlp/gptj 6 | ``` 7 | 8 | Activate the PyTorch virtual environment and install the requirements. 9 | ```bash 10 | source ~/R_2.3.0/venv_cerebras_pt/bin/activate 11 | pip install -r ~/R_2.3.0/modelzoo/requirements.txt 12 | ``` 13 | 14 | Replace the config file with the correct configuration file. 15 | ```bash 16 | cp /software/cerebras/dataset/gptj/params_gptj_6B_sampleds.yaml configs/params_gptj_6B_sampleds.yaml 17 | ``` 18 | 19 | Run the training job. 20 | ```bash 21 | export MODEL_DIR=model_dir_gptj 22 | if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi 23 | python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.3.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log 24 | ```
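Because the training command above pipes its output through `tee`, the progress can also be followed from a second shell while the job runs. A minimal sketch, assuming the run was started from the gptj directory shown earlier:

```bash
# From a second terminal, follow the log written by the training command above
tail -f ~/R_2.3.0/modelzoo/src/cerebras/modelzoo/models/nlp/gptj/mytest.log
```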
26 | Sample Output (last section) 27 | 28 | ```bash 29 | 2023-11-29 20:59:19,223 INFO: Beginning appliance run 30 | 2023-11-29 21:03:53,875 INFO: | Train Device=CSX, Step=100, Loss=8.43750, Rate=43.70 samples/sec, GlobalRate=43.70 samples/sec 31 | 2023-11-29 21:08:28,779 INFO: | Train Device=CSX, Step=200, Loss=8.12500, Rate=43.67 samples/sec, GlobalRate=43.67 samples/sec 32 | 2023-11-29 21:08:28,781 INFO: Saving checkpoint at step 200 33 | 2023-11-29 21:13:56,695 INFO: Saved checkpoint model_dir_gptj/checkpoint_200.mdl 34 | 2023-11-29 21:14:30,135 INFO: Heartbeat thread stopped for wsjob-kd4olqkhu6ya8qqzt88utd. 35 | 2023-11-29 21:14:30,142 INFO: Training completed successfully! 36 | 2023-11-29 21:14:30,142 INFO: Processed 24000 sample(s) in 910.883781998 seconds. 37 | ``` 38 |
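While the run is in flight, it can be monitored or canceled from a second terminal with the `csctl` commands described in the Cerebras README above. A minimal sketch (substitute the `wsjob-...` id printed in your own log):

```bash
# From a second terminal on the CS-2 login node:
source ~/R_2.3.0/venv_cerebras_pt/bin/activate
csctl get jobs                          # list jobs that have not yet completed
# csctl cancel job wsjob-<your-job-id>  # cancel if needed, using the id from your log
```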
39 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/README.md: -------------------------------------------------------------------------------- 1 | # Graphcore 2 | 3 | ## Connection to Graphcore 4 | 5 | ![Graphcore connection diagram](./graphcore_login.png) 6 | 7 | Log in to the Graphcore login node from your local machine. 8 | Once you are on the login node, ssh to one of the Graphcore nodes. 9 | 10 | ```bash 11 | local > ssh ALCFUserID@gc-login-01.ai.alcf.anl.gov 12 | # or 13 | local > ssh ALCFUserID@gc-login-02.ai.alcf.anl.gov 14 | ``` 15 | ```bash 16 | login-01.ai.alcf.anl.gov > ssh gc-poplar-02.ai.alcf.anl.gov 17 | # or 18 | login-01.ai.alcf.anl.gov > ssh gc-poplar-03.ai.alcf.anl.gov 19 | # or 20 | login-01.ai.alcf.anl.gov > ssh gc-poplar-04.ai.alcf.anl.gov 21 | ``` 22 | 23 | ## Create Virtual Environment 24 | 25 | ### PyTorch virtual environment 26 | 27 | ```bash 28 | mkdir -p ~/venvs/graphcore 29 | virtualenv ~/venvs/graphcore/poptorch33_env 30 | source ~/venvs/graphcore/poptorch33_env/bin/activate 31 | 32 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0 33 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 34 | pip install $POPLAR_SDK_ROOT/poptorch-3.3.0+113432_960e9c294b_ubuntu_20_04-cp38-cp38-linux_x86_64.whl 35 | ``` 36 | 37 | ### TensorFlow virtual environment 38 | 39 | ```bash 40 | virtualenv ~/venvs/graphcore/tensorflow2_33_env 41 | source ~/venvs/graphcore/tensorflow2_33_env/bin/activate 42 | 43 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0 44 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 45 | pip install $POPLAR_SDK_ROOT/tensorflow-2.6.3+gc3.3.0+251580+08d96978c7f+amd_znver1-cp38-cp38-linux_x86_64.whl 46 | pip install $POPLAR_SDK_ROOT/keras-2.6.0+gc3.3.0+251582+a3785372-py2.py3-none-any.whl 47 | ``` 48 | ## Miscellaneous Environment Variables 49 | ```bash 50 | mkdir ~/tmp 51 | export TF_POPLAR_FLAGS=--executable_cache_path=~/tmp 52 | export POPTORCH_CACHE_DIR=~/tmp 53 | 54 | export POPART_LOG_LEVEL=WARN 55 | export POPLAR_LOG_LEVEL=WARN 56 | export POPLIBS_LOG_LEVEL=WARN 57 | 58 | export PYTHONPATH=/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/python:$PYTHONPATH 59 | ``` 60 | ## Clone Graphcore Examples 61 | 62 | We use examples from the [Graphcore Examples repository](https://github.com/graphcore/examples) for this hands-on. 63 | Clone the Graphcore Examples repository. 64 | ```bash 65 | mkdir ~/graphcore 66 | cd ~/graphcore 67 | git clone https://github.com/graphcore/examples.git 68 | cd examples 69 | git tag 70 | git checkout v3.3.0 71 | ``` 72 | 73 | ## Job Queuing and Submission 74 | 75 | ALCF's Graphcore POD64 system uses Slurm for job submission and queueing. Below are some of the important commands for using Slurm. 76 | 77 | * `srun` : The Slurm command `srun` can be used to run individual Python scripts. Use the `--ipus=` option to specify the number of IPUs required for the run, 78 | e.g. `srun --ipus=1 python mnist_poptorch.py` 79 | * `sbatch` : jobs can be submitted to the Slurm workload manager through a batch script by using the `sbatch` command. 80 | * `squeue` : provides information about jobs located in the Slurm scheduling queue. 81 | * `scancel` : is used to signal or cancel jobs, job arrays, or job steps. 82 | 83 | ## Run Examples 84 | 85 | Refer to the respective instructions below. 86 | 88 | * [GPT2](./gpt2.md) 89 | 90 | ```bash 91 | Note: Precompiled artifacts are present at the /software/graphcore/projects/models_compile location for the above models.
92 | Copy them to your ~/tmp and set export POPTORCH_CACHE_DIR=~/tmp to skip the compile process. 93 | ``` 94 | ## Profiling 95 | 96 | We will use PopVision Graph Analyzer and System Analyzer to produce profiles. 97 | 98 | * [PopVision Graph Analyzer User Guide](https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/) 99 | * [PopVision System Analyzer User Guide](https://docs.graphcore.ai/projects/system-analyser-userguide/en/latest/) 100 | * [PopVision Tools Downloads](https://www.graphcore.ai/developer/popvision-tools#downloads) 101 | 102 | #### PopVision Graph Analyzer 103 | 104 | To generate a profile for PopVision Graph Analyzer, run the executable with the following prefix: 105 | 106 | ```bash 107 | $ POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true", "autoReport.directory":"./graph_profile", "profiler.includeFlopEstimates": "true"}' python mnist_poptorch.py 108 | ``` 109 | 110 | This will generate all the graph profiling reports along with FLOP estimates and save the output to the graph_profile directory. 111 | 112 | To visualize the profiles, download the generated profiles to a local machine and open them using PopVision Graph Analyzer. 113 | 114 | #### PopVision System Analyzer 115 | 116 | To generate a profile for PopVision System Analyzer, run the executable with the following prefix: 117 | 118 | ```bash 119 | $ PVTI_OPTIONS='{"enable":"true", "directory": "./system_profile"}' python mnist_poptorch.py 120 | ``` 121 | This will generate all the system profiling reports and save the output to the system_profile directory. 122 | 123 | To visualize the profiles, download the generated profiles to a local machine and open them using PopVision System Analyzer. 124 | 125 | ## Useful Resources 126 | 127 | * [ALCF Graphcore Documentation](https://docs.alcf.anl.gov/ai-testbed/graphcore/system-overview/) 128 | * [Graphcore Documentation](https://docs.graphcore.ai/en/latest/) 129 | * [Graphcore Examples Repository](https://github.com/graphcore/examples) 130 | * Graphcore SDK Path: `/software/graphcore/poplar_sdk` 131 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/gpt2.md: -------------------------------------------------------------------------------- 1 | # GPT2 on Graphcore 2 | 3 | These instructions train a GPT-2 PyTorch model on the POD16.
4 | 5 | ##### Go to the directory with the GPT2 example 6 | ```bash 7 | cd ~/graphcore/examples/nlp/gpt2/pytorch 8 | ``` 9 | 10 | ##### Create a new PopTorch environment 11 | ```bash 12 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0/ 13 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 14 | 15 | virtualenv ~/venvs/graphcore/poptorch33_gpt2 16 | source ~/venvs/graphcore/poptorch33_gpt2/bin/activate 17 | pip install $POPLAR_SDK_ROOT/poptorch-3.3.0+113432_960e9c294b_ubuntu_20_04-cp38-cp38-linux_x86_64.whl 18 | export PYTHONPATH=$POPLAR_SDK_ROOT/python:$PYTHONPATH 19 | ``` 20 | 21 | ##### Install requirements 22 | 23 | ```bash 24 | pip3 install -r requirements.txt 25 | ``` 26 | 27 | ##### Run GPT2 on 4 IPUs (single instance) 28 | 29 | * Compile and run from scratch: 30 | ```bash 31 | /opt/slurm/bin/srun --ipus=4 python /home/$USER/graphcore/examples/nlp/gpt2/pytorch/train_gpt2.py --model gpt2 --ipus-per-replica 4 --replication-factor 1 --gradient-accumulation 2048 --device-iterations 8 --batch-size 1 --layers-per-ipu 0 4 4 4 --matmul-proportion 0.15 0.15 0.15 0.15 --max-len 1024 --optimizer AdamW --learning-rate 0.00015 --lr-schedule cosine --lr-warmup 0.01 --remap-logit True --enable-sequence-serialized True --embedding-serialization-factor 4 --recompute-checkpoint-every-layer True --enable-half-partials True --replicated-tensor-sharding True --dataset 'generated' --epochs 1 32 | ``` 33 | * Run from precompiled artifacts: 34 | ```bash 35 | Note: Precompiled artifacts are present at the /software/graphcore/projects/models_compile location for the above models. 36 | Copy them to your ~/tmp and set export POPTORCH_CACHE_DIR=~/tmp to skip the compile process. 37 | ``` 38 | 39 | ##### Run GPT2 on 16 IPUs (4 instances) 40 | ```bash 41 | /opt/slurm/bin/srun --ipus=16 python /home/$USER/graphcore/examples/nlp/gpt2/pytorch/train_gpt2.py --model gpt2 --ipus-per-replica 4 --replication-factor 4 --gradient-accumulation 2048 --device-iterations 8 --batch-size 1 --layers-per-ipu 0 4 4 4 --matmul-proportion 0.15 0.15 0.15 0.15 --max-len 1024 --optimizer AdamW --learning-rate 0.00015 --lr-schedule cosine --lr-warmup 0.01 --remap-logit True --enable-sequence-serialized True --embedding-serialization-factor 4 --recompute-checkpoint-every-layer True --enable-half-partials True --replicated-tensor-sharding True --dataset 'generated' --epochs 1 42 | ``` 43 |
44 | Sample Output 45 | 46 | ```bash 47 | srun: job 10697 queued and waiting for resources 48 | srun: job 10697 has been allocated resources 49 | Building (if necessary) and loading remap_tensor_ce. 50 | Failed to find compiled extension; rebuilding. 51 | Building (if necessary) and loading residual_add_inplace_pattern. 52 | Model initializing 53 | -------------------- Device Allocation -------------------- 54 | Embedding --> IPU 0 55 | Layer 0 --> IPU 1 56 | Layer 1 --> IPU 1 57 | Layer 2 --> IPU 1 58 | Layer 3 --> IPU 1 59 | Layer 4 --> IPU 2 60 | Layer 5 --> IPU 2 61 | Layer 6 --> IPU 2 62 | Layer 7 --> IPU 2 63 | Layer 8 --> IPU 3 64 | Layer 9 --> IPU 3 65 | Layer 10 --> IPU 3 66 | Layer 11 --> IPU 3 67 | LM_head --> IPU 0 68 | 69 | step 0 of epoch 0, loss: 10.913220405578613, acc: 2.0071864128112793e-05, lr: 0.00012803300858899104, throughput: 646.8439205981404 samples/sec 70 | step 1 of epoch 0, loss: 10.836345672607422, acc: 1.9788742065429688e-05, lr: 7.5e-05, throughput: 1058.0979097185766 samples/sec 71 | step 2 of epoch 0, loss: 10.831247329711914, acc: 2.0518898963928223e-05, lr: 2.1966991411008938e-05, throughput: 1058.7595523807183 samples/sec 72 | step 3 of epoch 0, loss: 10.829034805297852, acc: 1.990795135498047e-05, lr: 0.0, throughput: 1059.6762623043378 samples/sec 73 | ``` 74 |
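While a submitted `srun` job is waiting for resources (the `srun: job 10697 queued` lines above), its state can be checked with the Slurm commands listed in the Graphcore README. A minimal sketch, assuming `squeue` and `scancel` live alongside `srun` in `/opt/slurm/bin/`:

```bash
# Show your queued and running jobs on the Graphcore Slurm cluster
/opt/slurm/bin/squeue -u $USER
# Cancel a job by the id srun/squeue reports, e.g. job 10697 above
# /opt/slurm/bin/scancel 10697
```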
75 | 76 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/graphcore_login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Graphcore/graphcore_login.png -------------------------------------------------------------------------------- /05_aiTestbed/Groq/README.md: -------------------------------------------------------------------------------- 1 | # Groq 2 | 3 | ## Connection to Groq 4 | 5 | ![Groq connection diagram](./groqrack_system_diagram.png) 6 | 7 | Log in to the Groq login node from your local machine. 8 | Once you are on the login node, ssh to one of the Groq nodes. 9 | 10 | ```bash 11 | local > ssh ALCFUserID@groq.ai.alcf.anl.gov 12 | ``` 13 | ```bash 14 | groq-login > ssh groq-r01-gn-01.ai.alcf.anl.gov 15 | # or 16 | groq-login > ssh groq-r01-gn-09.ai.alcf.anl.gov 17 | # or any node with a hostname of the form groq-r01-gn-0[1-9].ai.alcf.anl.gov 18 | ``` 19 | 20 | ## Create Virtual Environment 21 | 22 | ### Install Miniconda 23 | 24 | ```bash 25 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 26 | bash Miniconda3-latest-Linux-x86_64.sh 27 | ``` 28 | 29 | ### PyTorch virtual environment 30 | 31 | ```bash 32 | export PYTHON_VERSION=3.10.12 33 | conda create -n groqflow python=$PYTHON_VERSION -y 34 | conda activate groqflow 35 | ``` 36 | 37 | ### Install Groqflow 38 | 39 | ```bash 40 | # Alter this if you have cloned groqflow to some other location. 41 | git clone https://github.com/groq/groqflow.git 42 | cd ~/groqflow 43 | if [ -d "groqflow.egg-info" ]; then rm -r groqflow.egg-info; fi 44 | pip install --upgrade pip 45 | pip list --format=freeze > frozen.txt 46 | pip install -r frozen.txt -e . 47 | pushd . 48 | cd demo_helpers 49 | if [ -d "groqflow_demo_helpers.egg-info" ]; then rm -r groqflow_demo_helpers.egg-info; fi 50 | pip install -e . 51 | popd 52 | pip install soundfile 53 | ``` 54 | 55 | 56 | ## Job Queuing and Submission 57 | 58 | Groq jobs in the AI Testbed's groqrack are managed by the PBS job scheduler. 59 | 60 | * `qsub` : to submit a batch job using a script 61 | * `qstat`: to display queue information 62 | * `qdel`: to delete (cancel) a job 63 | * `qhold`: to hold a job 64 | 65 | ### Schedule a Batch Job 66 | 67 |
68 | Create a sample `run_minilmv2.sh` script as below: 69 | ```bash 70 | #!/bin/bash 71 | # >>> conda initialize >>> 72 | # !! Contents within this block are managed by 'conda init' !! 73 | __conda_setup="$(${HOME}'/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" 74 | if [ $? -eq 0 ]; then 75 | eval "$__conda_setup" 76 | else 77 | if [ -f "${HOME}/miniconda3/etc/profile.d/conda.sh" ]; then 78 | . "${HOME}/miniconda3/etc/profile.d/conda.sh" 79 | else 80 | export PATH="${HOME}/miniconda3/bin:$PATH" 81 | fi 82 | fi 83 | unset __conda_setup 84 | # <<< conda initialize <<< 85 | conda activate groqflow 86 | cd ~/groqflow/proof_points/natural_language_processing/minilm 87 | pip install -r requirements.txt 88 | python minilmv2.py 89 | ``` 90 |
91 | 92 | Ensure you have a groqflow conda environment activated. 93 | ```bash 94 | conda activate groqflow 95 | ``` 96 | 97 | Then run the script as a batch job with PBS. This will reserve a full eight-card (chip) node. 98 | ```bash 99 | qsub -l select=1,place=excl run_minilmv2.sh 100 | ``` 101 | 102 | 103 | ### Schedule an Interactive Job 104 | 105 | The following command gives you a single Groq node interactively for 1 hour: 106 | ```bash 107 | qsub -I -l walltime=1:00:00 108 | ``` 109 | Other flags that can be used: 110 | ```bash 111 | -l ncpus=1 112 | -l groq_accelerator=1 113 | ``` 114 | 115 | Then activate your groqflow environment, clone the repo, and run Python scripts with: 116 | ```bash 117 | conda activate groqflow 118 | cd ~/groqflow/proof_points/natural_language_processing/minilm 119 | pip install -r requirements.txt 120 | python minilmv2.py 121 | ``` 122 | 127 | 128 | 129 | ## Useful Resources 130 | 131 | * [ALCF Groq Documentation](https://docs.alcf.anl.gov/ai-testbed/groq/system-overview/) 132 | * [Groq Documentation](https://support.groq.com/#/login) 133 | * [Groq Examples Repository](https://github.com/groq/groqflow/tree/main/proof_points) 134 | 135 | -------------------------------------------------------------------------------- /05_aiTestbed/Groq/groqrack_system_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Groq/groqrack_system_diagram.png -------------------------------------------------------------------------------- /05_aiTestbed/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to AI Testbeds at ALCF and Hands-on 2 | 3 | Please refer to [Slides here](./AI%20Testbeds%20Hands-on%20ATPESC2023.pdf) and [online documentation](https://docs.alcf.anl.gov/ai-testbed/getting-started/) -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/README.md: -------------------------------------------------------------------------------- 1 | # SambaNova 2 | 3 | ## Connection to SambaNova 4 | 5 | Connection to a SambaNova node is a two-step process. The first step is to `ssh` to the `login node`. The second step is to log in to a SambaNova node from the `login node`. 6 | 7 | ![SambaNova connection diagram](./sambanova_login.jpg) 8 | 9 | Log in to the SambaNova login node from your local machine. This uses the **MobilePASS+** token generated every time you log in to the system. 10 | 11 | In the examples below, replace ALCFUserID with your ALCF user id. 12 | ```bash 13 | ssh ALCFUserID@sambanova.alcf.anl.gov 14 | Password: < MobilePASS+ code > 15 | ``` 16 | 17 | Note: Use the ssh "-v" option in order to debug any ssh problems. 18 | 19 | Once you are on the login node, ssh to one of the SambaNova nodes. 20 | ```bash 21 | ssh sn30-r1-h1 22 | ``` 23 | 24 | You can also ssh to `sn30-r1-h1`, `sn30-r1-h2`, `sn30-r2-h1`, `sn30-r2-h2`, `sn30-r3-h1`, `sn30-r3-h2`, `sn30-r4-h1`, `sn30-r4-h2`. 25 | 26 | ## Pre-Built Sample Venv 27 | 28 | The SambaNova software stack and associated environment variables are automatically set up at login on an SN30 node. 29 | 30 | Each of the samples or application examples provided by SambaNova has its own pre-built virtual environment which can be readily used. They are present in the `/opt/sambaflow/apps/` directory tree within each of the applications.
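For a quick orientation you can browse these pre-built environments directly. A minimal sketch (the application name and venv layout below are assumptions; check the actual tree on the node):

```bash
# List the applications that ship with pre-built environments
ls /opt/sambaflow/apps/
# Hypothetical activation; the exact venv path varies per application
# source /opt/sambaflow/apps/<app>/venv/bin/activate
```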
31 | 32 | 33 | ## Run Examples 34 | 35 | Refer to the respective instructions below. 36 | 37 | Main examples: 38 | * [GPT 1.5B](./gpt15b.md) 39 | 40 | 41 | ## Additional Resources 42 | 43 | * [ALCF SambaNova Documentation](https://docs.alcf.anl.gov/ai-testbed/sambanova/getting-started/) 44 | * [SambaNova Documentation](https://docs.sambanova.ai/developer/latest/sambaflow-intro.html) 45 | * SambaNova applications path: `/opt/sambaflow/apps/` 46 | * SambaNova model scripts: `/data/ANL/scripts/` 47 | * Important datasets: `/software/sambanova/dataset/` 48 | -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/gpt15b.md: -------------------------------------------------------------------------------- 1 | # GPT 1.5B Parameter Model on SambaNova 2 | 3 | ##### Create and move to the following directory. 4 | 5 | ```bash 6 | mkdir -p ~/apps/nlp/Gpt1.5B_single 7 | cd ~/apps/nlp/Gpt1.5B_single 8 | ``` 9 | 10 | ##### Copy the scripts to compile and run 11 | 12 | ```bash 13 | cp /data/ANL/scripts/Gpt1.5B_base_single_compile.sh . 14 | cp /data/ANL/scripts/Gpt1.5B_base_single_run.sh . 15 | 16 | chmod +x Gpt1.5B_base_single_compile.sh 17 | chmod +x Gpt1.5B_base_single_run.sh 18 | ``` 19 | 20 | ##### Run the script to compile and run 21 | 22 | ```bash 23 | ./Gpt1.5B_base_single_compile.sh 32 24 | ``` 25 | For more information refer to [Gpt1.5B](https://docs.alcf.anl.gov/ai-testbed/sambanova/example-programs/#gpt-15b). 26 | 27 | Precompiled artifacts are available in the /data/scratch/preCompiled/ directory, which can be accessed from any compute node. 28 | To use them, change the OUTDIR path in the above scripts as below: 29 | ```bash 30 | OUTDIR=/data/scratch/preCompiled/${MODEL_NAME} 31 | ``` 32 | -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/sambanova_login.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/SambaNova/sambanova_login.jpg -------------------------------------------------------------------------------- /05_aiTestbed/Slides/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Slides/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [ATPESC 2024](https://extremecomputingtraining.anl.gov/agenda-2024/) 2 | 3 | At the beginning of the day, we will temporarily split into two groups. Attendees can choose between "Introduction to deep learning" (01_deepLearning) and "Building data pipelines" (02_dataPipelines). 4 | 5 | The "Introduction to deep learning" session will rely on Jupyter Notebooks which are targeted for running on [Google's Colaboratory Platform](https://colab.research.google.com) or [ALCF JupyterHub](https://www.alcf.anl.gov/support-center/theta/jupyter-hub). The Colab platform gives the user a virtual machine in which to run Python code, including machine learning code. The VM comes with a preinstalled environment that includes most of what is needed for these tutorials.
6 | 7 | The other sessions involve Python scripts executed on the [Polaris](https://docs.alcf.anl.gov/polaris/getting-started/) and [AI Testbed](https://www.alcf.anl.gov/alcf-ai-testbed) platforms at ALCF. 8 | 9 | ## Using Google Colab 10 | Google Colab involves running Jupyter notebooks, which you have experience with from earlier in the week. 11 | 12 | Do the following before you come to the tutorial: 13 | * You need a Google Account to use Colaboratory 14 | * Go to [Google's Colaboratory Platform](https://colab.research.google.com) 15 | * You should see this page 16 | ![start_page](README_imgs/colab_start_page.png) 17 | * Now you can open the `File` menu at the top left and select `Open Notebook`, which will open a dialogue box. 18 | * Select the `GitHub` tab in the dialogue box. 19 | * From here you can enter the URL for the GitHub repo: `https://github.com/argonne-lcf/ATPESC_MachineLearning` and hit `Enter`. 20 | ![open_github](README_imgs/colab_open_github.png) 21 | * This will show you a list of the Notebooks available in the repo. When you select a notebook from this list it will create a copy for you in your Colaboratory account (all `*.ipynb` files in the Colaboratory account will be stored in your Google Drive). 22 | * To use a GPU in the notebook select `Runtime` -> `Change Runtime Type` and select an accelerator. 23 | -------------------------------------------------------------------------------- /README_imgs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/.DS_Store -------------------------------------------------------------------------------- /README_imgs/colab_clean_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_clean_page.png -------------------------------------------------------------------------------- /README_imgs/colab_open_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_open_github.png -------------------------------------------------------------------------------- /README_imgs/colab_start_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_start_page.png -------------------------------------------------------------------------------- /README_imgs/collab_start_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/collab_start_page1.png -------------------------------------------------------------------------------- /extra_statisticalLearning/README.md: -------------------------------------------------------------------------------- 1 | # Overview of Statistical Learning Methods 2 | Authors: [Sam Foreman](https://www.samforeman.me)[^1] ([foremans@anl.gov](mailto:foremans@anl.gov)), Taylor Childers (jchilders@anl.gov), and Romit Maulik (rmaulik@anl.gov) 3 | 4 | - 📕 [Statistical Learning](./src/atpesc/notebooks/statistical_learning.ipynb) 5 | - 📊
[slides](https://saforem2.github.io/ATPESC-StatisticalLearning) 6 | 7 | To install, 8 | 9 | ```bash 10 | git clone https://github.com/argonne-lcf/ATPESC_MachineLearning 11 | cd ATPESC_MachineLearning/extra_statisticalLearning 12 | python3 -m pip install -e . 13 | ``` 14 | 15 | and launch the [notebook](./src/atpesc/notebooks/statistical_learning.ipynb) 16 | 17 | [^1]: Presenter 18 | -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step1.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step2.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step3.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step4.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/learning-rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/learning-rate.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/image.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/kmeans.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/mse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/mse.pdf --------------------------------------------------------------------------------
/extra_statisticalLearning/images/mse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/mse.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd.pdf -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd_example.png -------------------------------------------------------------------------------- /extra_statisticalLearning/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | -------------------------------------------------------------------------------- /extra_statisticalLearning/setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/setup.cfg -------------------------------------------------------------------------------- /extra_statisticalLearning/src/atpesc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/src/atpesc/__init__.py -------------------------------------------------------------------------------- /extra_statisticalLearning/src/atpesc/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | common.py 3 | 4 | Contains helper functions. 
5 | """ 6 | from __future__ import absolute_import, division, print_function, annotations 7 | from typing import Any, Optional 8 | import logging 9 | 10 | import numpy as np 11 | from sklearn import datasets 12 | from pathlib import Path 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.cluster import KMeans 15 | 16 | from atpesc.utils.plots import plot_kmeans_obj 17 | 18 | SEED = 1234 19 | DEFAULT_LR = float(1e-6) 20 | 21 | HERE = Path(__file__).parent 22 | SECTION_DIR = HERE.parent.parent 23 | DATA_DIR = HERE.parent.parent.joinpath('data') 24 | 25 | Array = np.ndarray 26 | 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | def predict_price(slope: Any, input_area: Any): 32 | return slope * input_area 33 | 34 | 35 | def evaluate(slope, input_area, true_price) -> np.floating: 36 | price_prediction = predict_price(slope, input_area) 37 | return np.mean((true_price - price_prediction) ** 2) 38 | 39 | 40 | def learn( 41 | input_area: float, 42 | input_price: float, 43 | input_slope: float, 44 | learning_rate: Optional[float] = None 45 | ) -> np.floating: 46 | learning_rate = DEFAULT_LR if learning_rate is None else learning_rate 47 | # ------------------------------------ 48 | # 1. First compute: df/dx, where: 49 | # f = predict_price 50 | # x = input_size 51 | # ------------------------------------ 52 | tmp = (2. * input_area) * ( 53 | predict_price(input_slope, input_area) - input_price 54 | ) 55 | dfdx = np.mean(tmp) 56 | # -------------------------------------------- 57 | # 2. Update the slope via SGD Update step 58 | # -------------------------------------------- 59 | return input_slope - learning_rate * dfdx 60 | 61 | 62 | def load_cancer_data() -> tuple[Array, Array]: 63 | """Returns cancer dataset (unscaled).""" 64 | from sklearn import datasets 65 | import pandas as pd 66 | data = datasets.load_breast_cancer() 67 | assert isinstance(data, dict) 68 | x = pd.DataFrame( 69 | data['data'], 70 | columns=data['feature_names'] 71 | ) 72 | x = x[sorted(x.columns)] 73 | y = data['target'] 74 | 75 | return x, y 76 | 77 | 78 | def create_run_blob_kmeans( 79 | nsamples: int, 80 | nfeatures: int, 81 | nclusters: int 82 | ) -> None: 83 | x, _ = datasets.make_blobs( 84 | n_samples=nsamples, 85 | n_features=nfeatures, 86 | centers=nclusters, 87 | random_state=SEED 88 | ) 89 | 90 | # Normalize features 91 | x_ = StandardScaler().fit_transform(x) 92 | model = KMeans( 93 | n_clusters=nclusters, 94 | init='k-means++', 95 | n_init=10, 96 | max_iter=300, 97 | random_state=SEED, 98 | # n_jobs=4, 99 | ) 100 | model.fit(x_) 101 | plot_kmeans_obj(x_, 20) 102 | -------------------------------------------------------------------------------- /introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introductory Material\n", 8 | "\n", 9 | "Run this introductory IPython Notebook into your [Colaborotory](https://colab.research.google.com).\n", 10 | "\n", 11 | "This is a 'Markdown' or 'Text' cell where notes can be made to help you remember what you were doing." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# This is a code cell where you can run Python code on the local machine\n", 21 | "import tensorflow as tf\n", 22 | "print(f'tensorflow version: {tf.__version__}')\n", 23 | "# when you're done typing you hit Shift+Enter and the cell is executed in the Python Interpreter" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "You can also run bash shell commands using `!`" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "!echo Hello World" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "!ls" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Let's check out the ATPESC_MachineLearning repo onto our local Colaboratory Virtual Machine:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "!git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git repo" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "!ls repo" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# Notebook Runtimes\n", 81 | "While running a notebook, keep in mind that the Python Interpreter is running in the background and will retain all previously executed code, meaning variables persist until the runtime is killed or reset. This can be done from the 'Runtime' menu above.\n", 82 | "\n", 83 | "Let's see how this works." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "x = 42\n", 93 | "print(x)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Now reset/restart the runtime via the menu and run the following:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# This will cause a 'NameError' exception since the variable does not exist.\n", 110 | "print(x)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "On Colaboratory there is another level of restart/reset called 'Factory reset runtime' which not only restarts the Python interpreter but also reboots your VM, causing all local files to be lost.
For example, run the following command:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# list the repo we cloned from git\n", 127 | "!ls repo" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Now run 'Factory reset runtime' from the menu and then run this command:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# this will throw an error since the repo directory no longer exists\n", 144 | "!ls repo" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# clone the repo again; we want to use it later in our jobs\n", 154 | "!git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git repo" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.7.4" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | --------------------------------------------------------------------------------