├── .DS_Store
├── .gitignore
├── 01_deepLearning
│   ├── .DS_Store
│   ├── 01_introduction_mnist.ipynb
│   ├── 02_conv_networks.ipynb
│   ├── README.md
│   └── images
│       ├── LinearModel_1.png
│       ├── MaxpoolSample2.png
│       ├── MnistExamples.png
│       ├── Padding.png
│       ├── ResNet.png
│       ├── U-Nets.png
│       ├── ViT.gif
│       ├── _placeholder
│       ├── activation.jpeg
│       ├── activation_functions.png
│       ├── bias_vs_variance.png
│       ├── conv.png
│       ├── convNN_BlockDiagram.jpeg
│       ├── conv_layer.png
│       ├── convnets-feature-maps.png
│       ├── deep_nn.png
│       ├── dropout.png
│       ├── learning-rate-gradient-descent.jpeg
│       ├── lr.jpeg
│       ├── mnist_task.png
│       ├── multiple_channels.png
│       ├── nn1.pdf
│       ├── nn1.png
│       ├── nonconvex.png
│       ├── one_more_layer.png
│       ├── optimization_types.png
│       ├── shallow_nn.png
│       ├── test_data_rule.png
│       ├── three_layer_network.png
│       └── tiny_network.png
├── 02_dataPipelines
│   ├── 00_tensorflowDatasetAPI
│   │   ├── README.md
│   │   ├── ilsvrc.json
│   │   ├── ilsvrc_dataset.py
│   │   ├── logdir
│   │   │   └── .gitignore
│   │   └── submit_polaris.sh
│   ├── 01_pytorchDatasetAPI
│   │   ├── README.md
│   │   ├── imagenet_parallel.py
│   │   ├── imagenet_serial.py
│   │   ├── logdir
│   │   │   └── .gitignore
│   │   └── submit_polaris.sh
│   ├── README.md
│   └── images
│       ├── ilsvrc_64threads.png
│       ├── ilsvrc_64threads_zoom.png
│       ├── ilsvrc_parallel.png
│       ├── ilsvrc_serial.png
│       ├── ilsvrc_serial_zoom.png
│       ├── n01667778_12001.JPEG
│       ├── n02094114_1205.JPEG
│       └── pytorch_threading.png
├── 03_introlangmodels
│   ├── 03_languagemodels.ipynb
│   ├── 03_languagemodels_colab.ipynb
│   ├── README.md
│   ├── dataset
│   │   ├── cached_lm_GPT2TokenizerFast_128_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_input.txt.lock
│   │   ├── cached_lm_GPT2TokenizerFast_128_test_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_test_input.txt.lock
│   │   ├── cached_lm_GPT2TokenizerFast_128_train_input.txt
│   │   ├── cached_lm_GPT2TokenizerFast_128_train_input.txt.lock
│   │   ├── input.txt
│   │   ├── test_input.txt
│   │   └── train_input.txt
│   ├── images
│   │   ├── Attention_Vis.png
│   │   ├── BERT_Explanation.webp
│   │   ├── BERT_input_sent.webp
│   │   ├── Byte_Pair_enc.webp
│   │   ├── Graphformer.png
│   │   ├── LLM_Architectures.webp
│   │   ├── LLM_Blackbox.png
│   │   ├── LLM_Theoret_Apps.pptx
│   │   ├── Lorem.png
│   │   ├── Postitional_Embedding.webp
│   │   ├── Protein-Structure-06.png
│   │   ├── RNA-codons.svg.png
│   │   ├── The_transformer_encoder_decoder_stack.png
│   │   ├── The_transformer_encoders_decoders.png
│   │   ├── Transformer_Arch.png
│   │   ├── Transformer_Enc_Dec_Blocks.png
│   │   ├── Transformer_decoder.png
│   │   ├── WordPieceTok.webp
│   │   ├── attention_is_all_you_need.png
│   │   ├── chars-tokenization.png
│   │   ├── decoder_only_block.png
│   │   ├── en_chapter1_transformers_chrono.svg
│   │   ├── encode_decode.png
│   │   ├── encoder_with_tensors_2.png
│   │   ├── genslm.png
│   │   ├── gpt-2-layers-2.png
│   │   ├── gpt2-output.png
│   │   ├── gpt2-self-attention-example-2.png
│   │   ├── one-hot-vocabulary-example.png
│   │   ├── output_target_probability_distributions.png
│   │   ├── positional_encoding.png
│   │   ├── recurrent_nn.png
│   │   ├── rnn.png
│   │   ├── self-attention-and-masked-self-attention.png
│   │   ├── self-attention-example-folders-3.png
│   │   ├── self-attention-output.png
│   │   ├── text-processing---machines-vs-humans.png
│   │   ├── the_transformer_3.png
│   │   ├── tokenize.png
│   │   ├── tokenize_words.png
│   │   ├── transformer-decoder-intro.png
│   │   ├── transformer_attention_heads_z.png
│   │   ├── transformer_decoder_output_softmax (1).png
│   │   ├── transformer_decoder_output_softmax.png
│   │   ├── transformer_decoding_1.gif
│   │   ├── transformer_decoding_2.gif
│   │   ├── transformer_logits_output_and_label.png
│   │   ├── transformer_multi-headed_self-attention-recap.png
│   │   ├── transformer_positional_encoding_vectors.png
│   │   ├── transformer_self-attention_visualization.png
│   │   ├── transformer_self-attention_visualization_3.png
│   │   ├── vision-transformer-vit.png
│   │   ├── viz-bert-voc-verbs.png
│   │   └── wordembedding.png
│   └── vocab.txt
├── 03_profileLearning
│   ├── .gitignore
│   ├── README.md
│   ├── line_profiler
│   │   ├── README.md
│   │   ├── train_MNIST.py
│   │   └── train_MNIST_iofix.py
│   ├── reduced_precision
│   │   ├── README.md
│   │   ├── images
│   │   │   ├── kernel-stats.png
│   │   │   ├── profiler_overview.png
│   │   │   ├── tf-stats.png
│   │   │   ├── top10-ops.png
│   │   │   ├── trace-viewer.png
│   │   │   ├── trace-zoomed-float32.png
│   │   │   └── trace-zoomed.png
│   │   └── train_MNIST_tf_function_XLA_mixed.py
│   ├── tf_function
│   │   ├── README.md
│   │   ├── train_MNIST_tf_function.py
│   │   ├── train_MNIST_tf_function_XLA.py
│   │   └── xla_bug_generated
│   │       ├── 1628017669967300.module_0000.after_optimizations-buffer-assignment.txt
│   │       ├── 1628017669967300.module_0000.after_optimizations.txt
│   │       ├── 1628017669967300.module_0000.before_optimizations.txt
│   │       ├── before_increase_dynamism_for_auto_jit_pass.pbtxt
│   │       ├── before_increase_dynamism_for_auto_jit_pass_1.pbtxt
│   │       ├── before_mark_for_compilation.pbtxt
│   │       ├── mark_for_compilation.pbtxt
│   │       ├── mark_for_compilation_annotated.pbtxt
│   │       ├── module_0000.ir-no-opt-noconst.ll
│   │       ├── module_0000.ir-no-opt.ll
│   │       ├── module_0000.ir-with-opt-noconst.ll
│   │       ├── module_0000.ir-with-opt.ll
│   │       ├── module_0000.ptx
│   │       └── module_0000.thunk_schedule
│   ├── tf_profiler
│   │   ├── README.md
│   │   ├── images
│   │   │   ├── kernel-stats.png
│   │   │   ├── profiler_overview.png
│   │   │   ├── tf-stats.png
│   │   │   ├── top10_ops.png
│   │   │   ├── trace-viewer-zoom.png
│   │   │   └── trace-viewer.png
│   │   └── train_MNIST_tf_function_XLA.py
│   ├── train_MNIST.py
│   └── train_MNIST_optimized.py
├── 04_distributedLearning
│   ├── ATPESC_2024_AIMLTrack_DDL_Zheng.pdf
│   ├── DDP
│   │   └── 04_pytorch_cnn_ddp.py
│   ├── DeepSpeed
│   │   ├── 04_pytorch_cnn_ds.py
│   │   └── ds_config.json
│   ├── Horovod
│   │   ├── 04_keras_cnn_concise.py
│   │   ├── 04_keras_cnn_concise_hvd.py
│   │   ├── 04_keras_cnn_verbose.py
│   │   ├── 04_keras_cnn_verbose_hvd.py
│   │   ├── 04_pytorch_cnn.py
│   │   ├── 04_pytorch_cnn_hvd.py
│   │   ├── HorovodTimeline
│   │   │   ├── cpu.json
│   │   │   └── gpu.json
│   │   └── mpitrace
│   │       ├── cpu
│   │       │   ├── #mpi_profile.2500161.0#
│   │       │   ├── mpi_profile.2500161.0
│   │       │   ├── mpi_profile.2500161.1
│   │       │   ├── mpi_profile.2500161.4
│   │       │   └── mpi_profile.2500161.5
│   │       └── gpu
│   │           ├── mpi_profile.2497205.0
│   │           ├── mpi_profile.2497205.2
│   │           ├── mpi_profile.2497205.5
│   │           └── mpi_profile.2497205.6
│   ├── README.md
│   ├── figures
│   │   ├── Horovod.png
│   │   ├── cpu_horovodtimeline.png
│   │   ├── distributed.png
│   │   ├── gpu_horovodtimeline.png
│   │   └── resnet50.png
│   ├── results
│   │   ├── concise_1.out
│   │   ├── concise_2.out
│   │   ├── concise_4.out
│   │   └── concise_8.out
│   └── submissions
│       ├── qsub_polaris.sc
│       └── qsub_thetagpu.sc
├── 05_aiTestbed
│   ├── .DS_Store
│   ├── Cerebras
│   │   ├── Cerebras_Wafer-Scale_Cluster_login_diagram.png
│   │   ├── README.md
│   │   └── gpt-j.md
│   ├── Graphcore
│   │   ├── README.md
│   │   ├── gpt2.md
│   │   └── graphcore_login.png
│   ├── Groq
│   │   ├── README.md
│   │   ├── groqrack_system_diagram.png
│   │   └── minilm.md
│   ├── README.md
│   ├── SambaNova
│   │   ├── README.md
│   │   ├── gpt15b.md
│   │   └── sambanova_login.jpg
│   └── Slides
│       └── .DS_Store
├── README.md
├── README_imgs
│   ├── .DS_Store
│   ├── colab_clean_page.png
│   ├── colab_open_github.png
│   ├── colab_start_page.png
│   └── collab_start_page1.png
├── extra_statisticalLearning
│   ├── README.md
│   ├── assets
│   │   ├── atpesc-k-means-step1.svg
│   │   ├── atpesc-k-means-step2.svg
│   │   ├── atpesc-k-means-step3.svg
│   │   ├── atpesc-k-means-step4.svg
│   │   ├── learning-rate.svg
│   │   └── pngs
│   │       ├── atpesc-k-means-step1.png
│   │       ├── atpesc-k-means-step2.png
│   │       ├── atpesc-k-means-step3.png
│   │       ├── atpesc-k-means-step4.png
│   │       └── learning-rate.png
│   ├── data
│   │   ├── realestate_train.csv
│   │   └── rna.csv
│   ├── images
│   │   ├── image.png
│   │   ├── kmeans.png
│   │   ├── mse.pdf
│   │   ├── mse.png
│   │   ├── mse.svg
│   │   ├── sgd.pdf
│   │   ├── sgd.png
│   │   ├── sgd.svg
│   │   └── sgd_example.png
│   ├── pyproject.toml
│   ├── setup.cfg
│   └── src
│       └── atpesc
│           ├── __init__.py
│           ├── common.py
│           ├── notebooks
│           │   └── statistical_learning.ipynb
│           └── utils
│               └── plots.py
└── introduction.ipynb

--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/.DS_Store
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.ipynb_checkpoints
--------------------------------------------------------------------------------
/01_deepLearning/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/.DS_Store
--------------------------------------------------------------------------------
/01_deepLearning/README.md:
--------------------------------------------------------------------------------
# Introduction to deep learning
ATPESC 2024

Author: Bethany Lusch (blusch@anl.gov), adapting materials from Marieme Ngom, Prasanna Balaprakash, Taylor Childers, Corey Adams, and Kyle Felker.

This is a hands-on introduction to deep learning, a machine learning technique that tends to outperform other techniques when dealing with large amounts of data.

This is a quick overview, but the goals are:
- to introduce the fundamental concepts of deep learning through hands-on activities
- to give you the necessary background for the more advanced topics on scaling and performance that we will teach later today.

Some rough definitions:

- Artificial intelligence (AI) is a set of approaches to solving complex problems by imitating the brain's ability to learn.
- Machine learning (ML) is the field of study that gives computers the ability to learn without being explicitly programmed (i.e., learning patterns instead of writing down rules). Arguably, machine learning is now a subfield of AI.

Ready for more?
- Here are some of our longer training materials: https://www.alcf.anl.gov/alcf-ai-science-training-series
- Here's a thorough hands-on textbook: [book](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) with [notebooks](https://github.com/ageron/handson-ml2).

We will work on a classification problem involving the [MNIST dataset](http://yann.lecun.com/exdb/mnist/), which contains thousands of examples of handwritten numbers, with each digit labeled 0-9. The model learns to "classify" each image as one of the ten classes.
![MNIST Task](images/mnist_task.png)

We are going to run Jupyter notebooks. You can run them in Google Colab (see instructions [here](../README.md)). If that's a problem, you can also use your own computer or ALCF's [JupyterHub](https://docs.alcf.anl.gov/services/jupyter-hub/).
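To make the classification task concrete, here is a minimal Keras sketch of an MNIST classifier. This is a simplified stand-in, not the notebooks' exact code, and the layer sizes and epoch count are illustrative choices:

```python
import tensorflow as tf

# Load MNIST: 60k training and 10k test images of handwritten digits (28x28 grayscale)
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0  # scale pixel values to [0, 1]

# A small fully-connected classifier: 10 outputs, one probability per digit class
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.fit(x_train, y_train, epochs=3, validation_data=(x_test, y_test))
```

Even this tiny network reaches roughly 97-98% test accuracy; the notebooks build up the same pieces (layers, activations, loss, optimizer) step by step and then move on to convolutional architectures.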
26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /01_deepLearning/images/LinearModel_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/LinearModel_1.png -------------------------------------------------------------------------------- /01_deepLearning/images/MaxpoolSample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/MaxpoolSample2.png -------------------------------------------------------------------------------- /01_deepLearning/images/MnistExamples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/MnistExamples.png -------------------------------------------------------------------------------- /01_deepLearning/images/Padding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/Padding.png -------------------------------------------------------------------------------- /01_deepLearning/images/ResNet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/ResNet.png -------------------------------------------------------------------------------- /01_deepLearning/images/U-Nets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/U-Nets.png -------------------------------------------------------------------------------- /01_deepLearning/images/ViT.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/ViT.gif -------------------------------------------------------------------------------- /01_deepLearning/images/_placeholder: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01_deepLearning/images/activation.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/activation.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/activation_functions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/activation_functions.png -------------------------------------------------------------------------------- /01_deepLearning/images/bias_vs_variance.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/bias_vs_variance.png -------------------------------------------------------------------------------- /01_deepLearning/images/conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/conv.png -------------------------------------------------------------------------------- /01_deepLearning/images/convNN_BlockDiagram.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/convNN_BlockDiagram.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/conv_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/conv_layer.png -------------------------------------------------------------------------------- /01_deepLearning/images/convnets-feature-maps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/convnets-feature-maps.png -------------------------------------------------------------------------------- /01_deepLearning/images/deep_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/deep_nn.png -------------------------------------------------------------------------------- /01_deepLearning/images/dropout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/dropout.png -------------------------------------------------------------------------------- /01_deepLearning/images/learning-rate-gradient-descent.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/learning-rate-gradient-descent.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/lr.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/lr.jpeg -------------------------------------------------------------------------------- /01_deepLearning/images/mnist_task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/mnist_task.png -------------------------------------------------------------------------------- 
/01_deepLearning/images/multiple_channels.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/multiple_channels.png -------------------------------------------------------------------------------- /01_deepLearning/images/nn1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nn1.pdf -------------------------------------------------------------------------------- /01_deepLearning/images/nn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nn1.png -------------------------------------------------------------------------------- /01_deepLearning/images/nonconvex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/nonconvex.png -------------------------------------------------------------------------------- /01_deepLearning/images/one_more_layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/one_more_layer.png -------------------------------------------------------------------------------- /01_deepLearning/images/optimization_types.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/optimization_types.png -------------------------------------------------------------------------------- /01_deepLearning/images/shallow_nn.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/shallow_nn.png -------------------------------------------------------------------------------- /01_deepLearning/images/test_data_rule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/test_data_rule.png -------------------------------------------------------------------------------- /01_deepLearning/images/three_layer_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/three_layer_network.png -------------------------------------------------------------------------------- /01_deepLearning/images/tiny_network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/01_deepLearning/images/tiny_network.png -------------------------------------------------------------------------------- 
/02_dataPipelines/00_tensorflowDatasetAPI/README.md:
--------------------------------------------------------------------------------
# Tensorflow Example Data Pipeline
by J. Taylor Childers (jchilders@anl.gov)

This example implements a data pipeline for the ImageNet dataset described in the [README](../README.md) one level up.

An example submission script for Polaris is provided.

Submit to Polaris using:
```bash
qsub -A <project> -q <queue> submit_polaris.sh
```

All log files go into the `logdir` folder.


# Profiler View

You can view the processes and how they occupy the compute resources in Tensorflow using tensorboard.

You can log in to Polaris using:
```bash
# our proxy port, must be > 1024
export PORT=10001
# login to Polaris with port forwarding
ssh -D $PORT user@polaris.alcf.anl.gov
# load any conda environment that has a compatible tensorboard installation
module use /soft/modulefiles
module load conda
conda activate
cd <path-to-this-example>
# start tensorboard (load_fast==false is a recent setting that seems to be needed until Tensorflow works out the bugs)
tensorboard --bind_all --logdir logdir
```
Note the port number that `tensorboard` reports when it starts up.

Only one user can use a specific port, so if you get an error, choose another port number larger than `1024`.

Once you have that set up, set the SOCKS5 proxy of your favorite browser to host `localhost` and port `$PORT` (where `$PORT` is the value you used in the above script, like `10001`). Then, in the browser URL bar, enter the login node on which you started `tensorboard`; for instance, you can type in `localhost:6006`. Here `6006` is the port that `tensorboard` uses by default to start up its web service, but it may vary if that port is already in use, so note the port reported by `tensorboard` when it starts.
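The example script drives profiling itself; if you want to produce similar traces from your own Keras training loop, one common approach (a minimal sketch, not part of this repo's scripts; the toy model and `profile_batch` range are assumptions) is the TensorBoard callback:

```python
import tensorflow as tf

# toy model and data, just to demonstrate trace generation
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.compile(optimizer="sgd", loss="mse")
x = tf.random.normal((1024, 8))
y = tf.random.normal((1024, 10))

# profile batches 10-20 and write TensorBoard logs (including profiler traces) to ./logdir
tb = tf.keras.callbacks.TensorBoard(log_dir="logdir", profile_batch=(10, 20))
model.fit(x, y, batch_size=32, epochs=1, callbacks=[tb])
```

Pointing `tensorboard --logdir logdir` at the resulting directory, as above, then exposes the Profile tab with the trace viewer.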
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/ilsvrc.json:
--------------------------------------------------------------------------------
{
    "data": {
        "handler": "ilsvrc_dataset",
        "batch_size": 128,
        "train_filelist": "/lus/eagle/projects/datasets/ImageNet/ILSVRC/ilsvrc_train_filelist.txt",
        "test_filelist": "/lus/eagle/projects/datasets/ImageNet/ILSVRC/ilsvrc_val_filelist.txt",
        "shuffle_buffer": 200000,
        "reshuffle_each_iteration": true,
        "num_parallel_readers": 8,
        "prefetch_buffer_size": 1,
        "crop_image_size": [256,256],
        "num_classes": 1000,
        "num_channels": 3
    }
}
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/logdir/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything in this directory
*
# Except this file
!.gitignore
--------------------------------------------------------------------------------
/02_dataPipelines/00_tensorflowDatasetAPI/submit_polaris.sh:
--------------------------------------------------------------------------------
#!/bin/bash -l
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -l filesystems=eagle:home_fs
#PBS -q debug
#PBS -o logdir/
#PBS -e logdir/

cd $PBS_O_WORKDIR

echo [$SECONDS] setup conda environment
module use /soft/modulefiles
module load conda
conda activate

echo [$SECONDS] python = $(which python)
echo [$SECONDS] python version = $(python --version)

echo [$SECONDS] setup local env vars
NODES=`cat $PBS_NODEFILE | wc -l`
GPUS_PER_NODE=4
RANKS=$((NODES * GPUS_PER_NODE))
echo NODES=$NODES GPUS_PER_NODE=$GPUS_PER_NODE RANKS=$RANKS

export OMP_NUM_THREADS=1
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

export OMP_NUM_THREADS=16
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

export OMP_NUM_THREADS=64
echo [$SECONDS] run example with $OMP_NUM_THREADS threads
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=$OMP_NUM_THREADS --cpu-bind depth --env OMP_NUM_THREADS=$OMP_NUM_THREADS -env OMP_PLACES=threads \
   python ilsvrc_dataset.py -c ilsvrc.json --interop $OMP_NUM_THREADS --intraop $OMP_NUM_THREADS \
   --logdir logdir

echo [$SECONDS] done
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/imagenet_serial.py:
--------------------------------------------------------------------------------
# This example builds a serial data pipeline to use as an example
import os
import time
import argparse
import datetime
total_start = time.time()
from PIL import Image
import numpy as np
import torch
from torchvision import transforms, models


# simple class to calculate a running mean and standard deviation
class MeanCalc:
    def __init__(self):
        self.sum = 0
        self.sum2 = 0
        self.n = 0

    def add(self, x):
        self.sum += x
        self.sum2 += x * x
        self.n += 1

    def mean(self):
        return self.sum / self.n

    def stddev(self):
        return np.sqrt(self.sum2 / self.n - self.mean()*self.mean())

    def __str__(self):
        return f'mean: {self.mean():.2f}, stddev: {self.stddev():.2f}'

# dataset handler for input files
class ImageNetDataset:
    def __init__(self, base_dir, file_list_path, id_to_index, transform=None):
        self.base_dir = base_dir
        self.transform = transform
        self.id_to_index = id_to_index
        with open(file_list_path, 'r') as file:
            self.image_paths = [line.strip() for line in file]

    def __len__(self):
        return len(self.image_paths)

    def load_image(self, idx):
        img_path = self.image_paths[idx]
        img = Image.open(img_path).convert('RGB')
        unique_id = img_path.split('/')[-2]
        target = self.id_to_index[unique_id]
        if self.transform:
            img = self.transform(img)
        return img, target

# the filenames contain a unique id for each image that corresponds to the object ID
# create a hash table for labels from string to int
def build_id_to_index_mapping(file_list_path):
    unique_ids = set()
    with open(file_list_path, 'r') as file:
        for line in file:
            unique_id = line.strip().split('/')[-2]
            unique_ids.add(unique_id)
    return {unique_id: idx for idx, unique_id in enumerate(sorted(unique_ids))}

# transform to resize and convert to tensor
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

def main():

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-b','--nbatch', type=int, default=64)
    parser.add_argument('-s','--nsteps', type=int, default=20)
    parser.add_argument('--profile', action='store_true', default=False)
    parser.add_argument('--base-dir', type=str, default='/lus/eagle/projects/datasets/ImageNet/ILSVRC')
    parser.add_argument('--file-list-path', type=str, default='ilsvrc_train_filelist.txt')
    parser.add_argument('--status-print-interval', type=int, default=5)
    args = parser.parse_args()

    # setup dataset
    base_dir = args.base_dir
    file_list_path = os.path.join(base_dir, args.file_list_path)
    id_to_index = build_id_to_index_mapping(file_list_path)
    dataset = ImageNetDataset(base_dir, file_list_path, id_to_index, transform=transform)

    # run settings
    batch_size = args.nbatch
    print(f'Batch size: {batch_size}')
    total_steps = args.nsteps
    status_print_interval = args.status_print_interval
    profile = args.profile

    # set device to gpu if available
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Create a model to use as an example
    model = models.resnet50(weights='IMAGENET1K_V1')
    model.to(device)
    criterion = torch.nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    model.train()

    # set profile path to be ./logdir/ + date-time
    log_path = os.path.join('./logdir', datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))

    # create pytorch profiler that outputs TensorBoard logs
    prof = torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=20, warmup=1, active=20, repeat=1),
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        # on_trace_ready=torch.profiler.tensorboard_trace_handler(log_path),
        record_shapes=True,
        profile_memory=True,
        with_stack=True
    )

    # start the profiler
    if profile: prof.start()

    step_time = time.time()
    step = 0
    image_rate = MeanCalc()

    # training loop
    while step < total_steps:
        images, targets = [], []
        # build an input batch serially
        for _ in range(batch_size):
            if step * batch_size + _ >= len(dataset):
                break
            img, target = dataset.load_image(step * batch_size + _)
            images.append(img)
            targets.append(target)
        if len(images) == 0:
            break
        # convert to pytorch tensors
        images = torch.stack(images).to(device)
        targets = torch.tensor(targets).to(device)
        step += 1

        # pass the batch through the model
        outputs = model(images)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if profile: prof.step()

        # print status
        if step % status_print_interval == 0:
            step_img_rate = status_print_interval * batch_size / (time.time() - step_time)
            print(f'step: {step}; step_img_rate: {step_img_rate:.2f}')
            if step > 5:
                image_rate.add(step_img_rate)
            step_time = time.time()

    if profile: prof.stop()
    if profile: prof.export_chrome_trace("imagenet_serial.json")
    print(f'Average image rate: {str(image_rate)}')

    print(f'All Done; total runtime: {time.time() - total_start:.2f}')

if __name__ == '__main__':
    main()
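The serial script above is deliberately the slow baseline: every image is decoded on the main process, one at a time, while the GPU waits. The repository's `imagenet_parallel.py` (not included in this dump) moves that work into worker processes; the following is an illustrative sketch of the same idea with `torch.utils.data.DataLoader`, using a toy dataset and assumed parameter values rather than the repo's actual code:

```python
import torch
from torch.utils.data import DataLoader, Dataset

# illustrative stand-in dataset; the real example wraps ImageNet JPEGs
class RandomImages(Dataset):
    def __len__(self):
        return 10000
    def __getitem__(self, idx):
        return torch.randn(3, 256, 256), idx % 1000

# num_workers forks worker processes that load and transform samples
# concurrently with GPU compute; each worker stages prefetch_factor batches
loader = DataLoader(RandomImages(), batch_size=64, num_workers=4,
                    prefetch_factor=2, pin_memory=True)

for images, targets in loader:
    pass  # the training step (forward/backward/optimizer) would go here
```

Sweeping `num_workers` (as the submission script below does with its `-w` flag) is the usual way to find how many loader processes a node's CPUs can profitably support.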
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/logdir/.gitignore:
--------------------------------------------------------------------------------
# Ignore everything in this directory
*
# Except this file
!.gitignore
--------------------------------------------------------------------------------
/02_dataPipelines/01_pytorchDatasetAPI/submit_polaris.sh:
--------------------------------------------------------------------------------
#!/bin/bash -l
#PBS -l select=1
#PBS -l walltime=00:20:00
#PBS -l filesystems=eagle:home_fs
#PBS -q debug
#PBS -o logdir/
#PBS -e logdir/

cd $PBS_O_WORKDIR

echo [$SECONDS] setup conda environment
module use /soft/modulefiles
module load conda
conda activate

echo [$SECONDS] python = $(which python)
echo [$SECONDS] python version = $(python --version)

echo [$SECONDS] setup local env vars
NODES=`cat $PBS_NODEFILE | wc -l`
GPUS_PER_NODE=4
RANKS=$((NODES * GPUS_PER_NODE))
echo NODES=$NODES GPUS_PER_NODE=$GPUS_PER_NODE RANKS=$RANKS

# for PyTorch DDP setup
export MASTER_ADDR="localhost"
export MASTER_PORT=12399
echo [$SECONDS] MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT

BATCH_SIZE=64
NSTEPS=100
PROFILE=--profile
# PROFILE=
echo [$SECONDS] using batch size $BATCH_SIZE and $NSTEPS steps

echo [$SECONDS] run serial example
python imagenet_serial.py -b $BATCH_SIZE -s $NSTEPS $PROFILE


NWORKERS=1
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=2
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=3
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

NWORKERS=4
echo [$SECONDS] run parallel with $NWORKERS workers
mpiexec -n $RANKS --ppn $GPUS_PER_NODE --depth=16 --cpu-bind depth python imagenet_parallel.py -b $BATCH_SIZE -s $NSTEPS -w $NWORKERS $PROFILE

echo [$SECONDS] done
--------------------------------------------------------------------------------
/02_dataPipelines/README.md:
--------------------------------------------------------------------------------
# Building a CPU-side data pipeline

Author: J. Taylor Childers (jchilders@anl.gov)

## Learning Goals:
- Use the CPUs of a system to build data batches in parallel while ML calculations are performed on the GPU
- Use parallel processes on the CPU to speed up the data pipeline
- Do all this using the frameworks' (PyTorch, Tensorflow) data APIs

New AI systems largely depend on CPU-GPU hybrid architectures. This makes efficient use of CPU-side resources important in order to feed sufficient data to the GPU algorithms. Ideally, the CPU processes data and builds training batches, while the GPU performs the compute-intensive forward and backward gradient calculations.

Here are examples of building a data pipeline for both Tensorflow and PyTorch. Tensorflow's data pipeline API is a bit more advanced than PyTorch's, so we'll focus on that one, though we also include a PyTorch example.

# ImageNet Dataset

This example uses the ImageNet dataset to build training batches.

![Turtle](images/n01667778_12001.JPEG) ![Dog](images/n02094114_1205.JPEG)

This dataset includes JPEG images and an XML annotation for each file that defines a bounding box for each class. Building a training batch requires pre-processing the images and annotations. In our example, we have created text files that list all the files in the training set and validation set. For each text file, we need to use the input JPEG files and build tensors that include multiple images per training batch.

# Tensorflow Dataset example

Tensorflow has some very nice tools to help us build the pipeline. You'll find the [example here](00_tensorflowDatasetAPI/ilsvrc_dataset.py).

## Build from file list
We'll start in the function `build_dataset_from_filelist`. A consolidated sketch of all the steps below appears at the end of this README.

1. Open the filelist
```python
# loading full filelist
filelist = []
with open(filelist_filename) as file:
    for line in file:
        filelist.append(line.strip())
```
2. Parse the list of files into a TF Tensor
```python
filelist = tf.data.Dataset.from_tensor_slices(filelist)
```
3. If we are using Horovod for MPI parallelism, we want to "shard" the data across nodes so each node processes unique data
```python
filelist = filelist.shard(config['hvd'].size(), config['hvd'].rank())
```
4. Shuffle our filelist at each epoch barrier
```python
filelist = filelist.shuffle(dc['shuffle_buffer'], reshuffle_each_iteration=dc['reshuffle_each_iteration'])
```
5. Run a custom function on the filelist, which effectively opens the JPEG file, loads the data into a TF Tensor and extracts the class labels. If there are multiple objects in the image, this function will return more than one image using the bounding boxes. `num_parallel_calls` allows this function to run in parallel, so many JPEG files can be read into memory and processed in parallel threads.
```python
ds = filelist.map(load_image_label_bb,
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
```
6. Since the previous map function may return one or more images, we need to unbatch the output before we batch it into our fixed batch size
```python
ds = ds.apply(tf.data.Dataset.unbatch)
ds = ds.batch(dc['batch_size'])
```
7. Tell the dataset it can prepare the next batch(es) prior to them being requested
```python
ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
```

Done.

We can now iterate over this dataset in a loop:
```python
for inputs, labels in ds:
    prediction = model(inputs)
    loss = loss_func(prediction, labels)
    # ...
```

## Parallel Processing on Polaris

The example `00_tensorflowDatasetAPI/ilsvrc_dataset.py` can be run via
```bash
cd 00_tensorflowDatasetAPI
qsub -A <project> -q debug submit_polaris.sh
```

This script will run the example 3 times: with 1 thread (no parallelism), 16 threads, and 64 threads per MPI process. The reported `imgs/sec` throughput will be lowest for serial processing and highest for 64 threads per MPI process. You can see in these screenshots from the [Tensorflow Profiler](https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras) how the processes are being utilized.

This profile shows a single process handling all the data pipeline work. All data pipeline calls are made serially when they could be done in parallel, and it takes over 3 seconds to prepare a batch of images.
![serial](images/ilsvrc_serial.png)

In the case of 64 threads per MPI process, batch processing time is down to 0.08 seconds. The profiler shows we are running with our 64 parallel processes, all of which are opening JPEGs, processing them into tensors, extracting truth information, and so on. One can see the `ReadFile` operation taking place in parallel; it opens the JPEG and reads the data into memory. This operation is the most time-consuming in this pipeline, and by parallelizing it we have improved our throughput.
![parallel](images/ilsvrc_64threads_zoom.png)
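Putting steps 1-7 together, here is a minimal self-contained sketch of the same pattern. It uses a toy `decode_file` in place of the example's `load_image_label_bb`, omits the Horovod sharding, and the sizes are illustrative, so treat it as a template rather than the repo's actual pipeline:

```python
import tensorflow as tf

filelist = ["a.jpg", "b.jpg", "c.jpg"]  # stand-in for the real file list

def decode_file(path):
    # toy stand-in: the real load_image_label_bb decodes the JPEG and
    # returns one image/label pair per annotated bounding box
    images = tf.random.uniform((2, 256, 256, 3))        # pretend 2 crops per file
    labels = tf.constant([0, 1], dtype=tf.int64)
    return images, labels

ds = tf.data.Dataset.from_tensor_slices(filelist)
ds = ds.shuffle(len(filelist), reshuffle_each_iteration=True)
ds = ds.map(decode_file, num_parallel_calls=tf.data.AUTOTUNE)
ds = ds.unbatch()               # flatten the variable-length map outputs
ds = ds.batch(2)                # re-batch to a fixed batch size
ds = ds.prefetch(tf.data.AUTOTUNE)

for images, labels in ds:
    print(images.shape, labels.shape)
```

The key performance levers are the `num_parallel_calls` on the map (parallel JPEG decoding) and the `prefetch` at the end (overlapping batch preparation with GPU compute), which is exactly what the 1/16/64-thread comparison above measures.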
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_64threads.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_64threads.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_64threads_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_64threads_zoom.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_parallel.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_parallel.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_serial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_serial.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/ilsvrc_serial_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/ilsvrc_serial_zoom.png
--------------------------------------------------------------------------------
/02_dataPipelines/images/n01667778_12001.JPEG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/n01667778_12001.JPEG
--------------------------------------------------------------------------------
/02_dataPipelines/images/n02094114_1205.JPEG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/n02094114_1205.JPEG
--------------------------------------------------------------------------------
/02_dataPipelines/images/pytorch_threading.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/02_dataPipelines/images/pytorch_threading.png
--------------------------------------------------------------------------------
/03_introlangmodels/README.md:
--------------------------------------------------------------------------------
# Introduction to Language models

Author: Archit Vasan (avasan@anl.gov), including and adapting materials and discussions over time by Varuni Sastri, Carlo Graziani, Taylor Childers, Venkat Vishwanath, Jay Alammar and Kevin Gimpel.

This tutorial introduces sequential data modeling, tokenization methods, and embeddings, and attempts to demystify aspects of the Transformer model architecture.

We will refer to this notebook:

[Introduction to language models](https://github.com/argonne-lcf/ATPESC_MachineLearning/blob/master/03_introlangmodels/03_languagemodels.ipynb)

The discussion will include:
* tokenization
* token embeddings
* positional encodings
* attention mechanisms
* output layers
* training loops

We will first introduce sequential data modeling and tokenization, then try "text-generation" using the popular GPT-2 model and the Hugging Face pipeline. Then we will code the model elements of a simple LLM from scratch and train it ourselves.

## Environment Setup
1. If you are using ALCF, first log in. From a terminal run the following command:
```
ssh username@polaris.alcf.anl.gov
```

2. We will be downloading data in our Jupyter notebook, which runs on hardware that by default has no Internet access. From the terminal on Polaris, edit the ~/.bash_profile file to have these proxy settings:
```
export HTTP_PROXY="http://proxy-01.pub.alcf.anl.gov:3128"
export HTTPS_PROXY="http://proxy-01.pub.alcf.anl.gov:3128"
export http_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export https_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export ftp_proxy="http://proxy-01.pub.alcf.anl.gov:3128"
export no_proxy="admin,polaris-adminvm-01,localhost,*.cm.polaris.alcf.anl.gov,polaris-*,*.polaris.alcf.anl.gov,*.alcf.anl.gov"
```

3. Clone this repository (or pull the latest changes) so that you have the updated notebooks:
```
git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git
```

4. Now that we have the updated notebooks, we can open them. If you are using ALCF JupyterHub or Google Colab, you can be reminded of the steps [here](https://github.com/argonne-lcf/ai-science-training-series/blob/main/01_intro_AI_on_Supercomputer/01_linear_regression_sgd.ipynb).

5. Reminder: Change the notebook's kernel to `datascience/conda-2024-04-29` (you may need to change the kernel each time you open a notebook for the first time):

   1. select *Kernel* in the menu bar
   2. select *Change kernel...*
   3. select *datascience/conda-2024-04-29* from the drop-down menu

## __Exciting example:__

Here is an image of GenSLM, described earlier by Arvind Ramanathan. This is a language model that can represent genomic information in a single model. It was shown to model the evolution of SARS-CoV-2 without expensive experiments.
![GenSLM](images/genslm.png)
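As a quick taste of the text-generation exercise described above, here is a minimal Hugging Face pipeline sketch (a simplified stand-in for the notebook's code; the prompt and sampling settings are arbitrary, and downloading GPT-2 requires the proxy settings from the setup steps):

```python
from transformers import pipeline

# download GPT-2 and build a text-generation pipeline
generator = pipeline("text-generation", model="gpt2")

# sample a short continuation of a prompt
out = generator("Deep learning on supercomputers", max_new_tokens=30, num_return_sequences=1)
print(out[0]["generated_text"])
```

And since the attention mechanism is the heart of what the notebook builds from scratch, here is the standard scaled dot-product attention in a few lines of PyTorch (a generic textbook formulation, not the notebook's exact implementation):

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    # attention weights: softmax of query-key similarity, scaled by sqrt(dim)
    d = q.size(-1)
    scores = q @ k.transpose(-2, -1) / d**0.5
    return F.softmax(scores, dim=-1) @ v

q = k = v = torch.randn(1, 5, 16)   # (batch, tokens, embedding dim)
out = scaled_dot_product_attention(q, k, v)
print(out.shape)                     # torch.Size([1, 5, 16])
```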
53 | 54 | * ["The Illustrated Transformer"](https://jalammar.github.io/illustrated-transformer/) by Jay Alammar 55 | * ["Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention)"](https://jalammar.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/) 56 | * ["The Illustrated GPT-2 (Visualizing Transformer Language Models)"](https://jalammar.github.io/illustrated-gpt2/) 57 | * ["LLM Tutorial Workshop (Argonne National Laboratory)"](https://github.com/brettin/llm_tutorial/tree/main) 58 | * ["LLM Tutorial Workshop Part 2 (Argonne National Laboratory)"](https://github.com/argonne-lcf/llm-workshop) 59 | 60 | 61 | -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_test_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt -------------------------------------------------------------------------------- /03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/dataset/cached_lm_GPT2TokenizerFast_128_train_input.txt.lock -------------------------------------------------------------------------------- /03_introlangmodels/images/Attention_Vis.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Attention_Vis.png -------------------------------------------------------------------------------- /03_introlangmodels/images/BERT_Explanation.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/BERT_Explanation.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/BERT_input_sent.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/BERT_input_sent.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/Byte_Pair_enc.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Byte_Pair_enc.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/Graphformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Graphformer.png -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Architectures.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Architectures.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Blackbox.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Blackbox.png -------------------------------------------------------------------------------- /03_introlangmodels/images/LLM_Theoret_Apps.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/LLM_Theoret_Apps.pptx -------------------------------------------------------------------------------- /03_introlangmodels/images/Lorem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Lorem.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Postitional_Embedding.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Postitional_Embedding.webp -------------------------------------------------------------------------------- 
/03_introlangmodels/images/Protein-Structure-06.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Protein-Structure-06.png -------------------------------------------------------------------------------- /03_introlangmodels/images/RNA-codons.svg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/RNA-codons.svg.png -------------------------------------------------------------------------------- /03_introlangmodels/images/The_transformer_encoder_decoder_stack.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/The_transformer_encoder_decoder_stack.png -------------------------------------------------------------------------------- /03_introlangmodels/images/The_transformer_encoders_decoders.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/The_transformer_encoders_decoders.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_Arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_Arch.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_Enc_Dec_Blocks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_Enc_Dec_Blocks.png -------------------------------------------------------------------------------- /03_introlangmodels/images/Transformer_decoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/Transformer_decoder.png -------------------------------------------------------------------------------- /03_introlangmodels/images/WordPieceTok.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/WordPieceTok.webp -------------------------------------------------------------------------------- /03_introlangmodels/images/attention_is_all_you_need.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/attention_is_all_you_need.png -------------------------------------------------------------------------------- /03_introlangmodels/images/chars-tokenization.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/chars-tokenization.png -------------------------------------------------------------------------------- /03_introlangmodels/images/decoder_only_block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/decoder_only_block.png -------------------------------------------------------------------------------- /03_introlangmodels/images/encode_decode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/encode_decode.png -------------------------------------------------------------------------------- /03_introlangmodels/images/encoder_with_tensors_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/encoder_with_tensors_2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/genslm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/genslm.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt-2-layers-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt-2-layers-2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt2-output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt2-output.png -------------------------------------------------------------------------------- /03_introlangmodels/images/gpt2-self-attention-example-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/gpt2-self-attention-example-2.png -------------------------------------------------------------------------------- /03_introlangmodels/images/one-hot-vocabulary-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/one-hot-vocabulary-example.png -------------------------------------------------------------------------------- /03_introlangmodels/images/output_target_probability_distributions.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/output_target_probability_distributions.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/positional_encoding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/positional_encoding.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/recurrent_nn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/recurrent_nn.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/rnn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/rnn.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-and-masked-self-attention.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-and-masked-self-attention.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-example-folders-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-example-folders-3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/self-attention-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/self-attention-output.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/text-processing---machines-vs-humans.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/text-processing---machines-vs-humans.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/the_transformer_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/the_transformer_3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/tokenize.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/tokenize.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/tokenize_words.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/tokenize_words.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer-decoder-intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer-decoder-intro.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_attention_heads_z.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_attention_heads_z.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoder_output_softmax (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoder_output_softmax (1).png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoder_output_softmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoder_output_softmax.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoding_1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoding_1.gif
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_decoding_2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_decoding_2.gif
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_logits_output_and_label.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_logits_output_and_label.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_multi-headed_self-attention-recap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_multi-headed_self-attention-recap.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_positional_encoding_vectors.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_positional_encoding_vectors.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_self-attention_visualization.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_self-attention_visualization.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/transformer_self-attention_visualization_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/transformer_self-attention_visualization_3.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/vision-transformer-vit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/vision-transformer-vit.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/viz-bert-voc-verbs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/viz-bert-voc-verbs.png
--------------------------------------------------------------------------------
/03_introlangmodels/images/wordembedding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_introlangmodels/images/wordembedding.png
--------------------------------------------------------------------------------
/03_profileLearning/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | *.lprof
3 | *.prof
4 | *.h5
5 | *.pb
6 | *.gz
7 | */logdir/*
8 | 
--------------------------------------------------------------------------------
/03_profileLearning/README.md:
--------------------------------------------------------------------------------
1 | # Profiling TensorFlow
2 | 
3 | _Authors_: Kyle Felker (felker@anl.gov), Corey Adams (corey.adams@anl.gov)
4 | 
5 | 
6 | In this example, we'll profile the CNN used to classify MNIST digits in the previous
7 | exercises. We will complete several rounds of profiling, each time enabling a new tool or
8 | optimization. At the end of the exercise, you'll have a much faster network.
9 | 
10 | Find the original script in `train_MNIST.py`.
11 | 
12 | All the scripts used here work in a Singularity container containing TensorFlow 2.5.0.
13 | 
14 | ```bash
15 | singularity exec --nv -B /lus /lus/theta-fs0/software/thetagpu/nvidia-containers/tensorflow2/tf2_21.07-py3.simg bash
16 | ```
17 | 
18 | This walkthrough was adapted from an earlier tutorial from the May 2021 ALCF Computational Performance Workshop, which used a Generative Adversarial Network (GAN) for the test code: [CPW21: Profiling TensorFlow](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/TensorFlow).
19 | A separate tutorial for profiling PyTorch codes (using a ResNet model) is also available from that workshop: [CPW21: Profiling PyTorch](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/PyTorchProfiler)
20 | 
21 | 
22 | ## A Starting Point
23 | 
24 | To download the MNIST dataset, make sure outbound HTTP(S) requests are routed through the ALCF proxy:
25 | ```bash
26 | export http_proxy=http://theta-proxy.tmi.alcf.anl.gov:3128
27 | export https_proxy=https://theta-proxy.tmi.alcf.anl.gov:3128
28 | ```
29 | 
30 | Run the original script, single node, like so: `python train_MNIST.py`. Feel free to ctrl+C once it hits a stable throughput.
31 | 
32 | Take note of the throughput reported!
33 | 
34 | ```
35 | 2021-08-02 21:49:36,778 - INFO - (0, 292), Loss: 0.109, step_time: 0.271, throughput: 235.822 img/s.
36 | 2021-08-02 21:49:37,050 - INFO - (0, 293), Loss: 0.129, step_time: 0.271, throughput: 235.804 img/s.
37 | 2021-08-02 21:49:37,321 - INFO - (0, 294), Loss: 0.022, step_time: 0.271, throughput: 236.466 img/s.
38 | 2021-08-02 21:49:37,593 - INFO - (0, 295), Loss: 0.073, step_time: 0.272, throughput: 235.060 img/s.
39 | 2021-08-02 21:49:37,865 - INFO - (0, 296), Loss: 0.026, step_time: 0.271, throughput: 235.941 img/s.
40 | 2021-08-02 21:49:38,136 - INFO - (0, 297), Loss: 0.042, step_time: 0.271, throughput: 236.474 img/s.
41 | 2021-08-02 21:49:38,407 - INFO - (0, 298), Loss: 0.054, step_time: 0.271, throughput: 236.156 img/s.
42 | 2021-08-02 21:49:38,679 - INFO - (0, 299), Loss: 0.132, step_time: 0.272, throughput: 235.603 img/s.
43 | 2021-08-02 21:49:38,951 - INFO - (0, 300), Loss: 0.091, step_time: 0.271, throughput: 235.760 img/s.
44 | 2021-08-02 21:49:39,222 - INFO - (0, 301), Loss: 0.024, step_time: 0.271, throughput: 236.121 img/s.
45 | 2021-08-02 21:49:39,494 - INFO - (0, 302), Loss: 0.229, step_time: 0.271, throughput: 235.878 img/s.
46 | ```
47 | 
48 | On average, the A100 system is moving about 237 images/second through this training loop. Let's dig into the first optimization in the [`line_profiler/`](./line_profiler/) subdirectory.
49 | 
50 | Below are the wrap-up conclusions, which you can read ahead or come back to later.
51 | 
52 | # Conclusions
53 | 
54 | Try the `optimized` version of the code - what throughput are you getting? It should be a good deal faster! (1.05 million img/s - roughly 4400x faster than the ~237 img/s baseline.) So, after all the profiling, what optimizations did we learn?
55 | 
56 | - Make sure that IO isn't a bottleneck. In this case, the fix for this bottleneck was simple. With big datasets it can be a significant challenge to keep the GPU fed and not idle on IO.
57 | - Make sure to use graph compilation where you can. It's easy to make mistakes here: you must make sure to use only TensorFlow operations!
58 | - Use XLA, if your training loops have more work in them than a simple MNIST CNN. It can give excellent speed ups by fusing operations.
59 | - Use reduced or mixed precision, again if your training loops have more local work than the example here. Reduced precision becomes particularly powerful when XLA is involved, allowing you to keep the Tensor Cores chugging along with fewer memory-bound operations.
60 | 
61 | In general, if you have an application running in TensorFlow, it's a great idea to profile periodically and make sure you've got all the basic optimizations down!
62 | 
63 | # Comparison to GAN example
64 | 
65 | As mentioned above, a very similar walkthrough based on a Generative Adversarial Network (GAN) is available here: [CPW21: Profiling TensorFlow](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/09_profiling_frameworks/TensorFlow). You are encouraged to compare the results from that tutorial to the lessons learned here. Despite very similar source code, the performance behavior differs from this CNN in some key aspects:
66 | - The optimized GAN only reaches 137k images/second
67 | - XLA is very important to achieving the optimal performance at this throughput range.
68 | - Mixed precision nets about +40k images/second over XLA alone
69 | - The CNN actually has more trainable parameters and is in some sense a "larger" model compared to the GAN, yet is much faster to train. All the extra "GAN-like" operations in the `forward_pass()` function (generate fake images, extra forward pass through the discriminator, label softening, randomly flipping the labels, etc.) cause the 5-10x slowdown relative to a standard feedforward CNN.
70 | 
--------------------------------------------------------------------------------
/03_profileLearning/line_profiler/train_MNIST.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | def init_mpi():
14 |     # Try to initialize Horovod to determine if we're running under MPI:
15 |     try:
16 |         hvd.init()
17 |         return hvd.rank(), hvd.size()
18 |     except:
19 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
20 |             raise Exception("MPI detected in command line but was not able to init!")
21 |         return 0, 1
22 | 
23 | 
24 | def configure_logger(rank):
25 |     '''Configure a global logger
26 | 
27 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
28 | 
29 |     Pass in the MPI rank.
30 | 
31 |     '''
32 |     logger = logging.getLogger()
33 | 
34 |     # Create a handler for STDOUT, but only on the root rank.
35 |     # If not distributed, we still get 0 passed in here.
36 |     if rank == 0:
37 |         stream_handler = logging.StreamHandler()
38 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
39 |         stream_handler.setFormatter(formatter)
40 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
41 |         logger.addHandler(handler)
42 | 
43 |         # Add a file handler too:
44 |         log_file = "process.log"
45 |         file_handler = logging.FileHandler(log_file)
46 |         file_handler.setFormatter(formatter)
47 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
48 |         logger.addHandler(file_handler)
49 | 
50 |         logger.setLevel(logging.INFO)
51 |     else:
52 |         # in this case, MPI is available but it's not rank 0
53 |         # create a null handler
54 |         handler = logging.NullHandler()
55 |         logger.addHandler(handler)
56 |         logger.setLevel(logging.INFO)
57 | 
58 | 
59 | class MNISTClassifier(tf.keras.models.Model):
60 | 
61 |     def __init__(self, activation=tf.nn.tanh):
62 |         tf.keras.models.Model.__init__(self)
63 | 
64 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
65 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
66 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
67 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
68 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
69 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
70 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
71 | 
72 |     def call(self, inputs):
73 |         '''
74 |         Reshape at input and output:
75 |         '''
76 |         # batch_size = inputs.shape[0]
77 | 
78 |         x = self.conv_1(inputs)
79 |         x = self.conv_2(x)
80 |         x = self.pool_3(x)
81 |         x = self.drop_4(x)
82 |         x = tf.keras.layers.Flatten()(x)
83 |         x = self.dense_5(x)
84 |         x = self.drop_6(x)
85 |         x = self.dense_7(x)
86 | 
87 |         return x
88 | 
89 | 
90 | def compute_loss(y_true, y_pred):
91 |     # if labels are integers, use sparse categorical crossentropy
92 |     # network's final layer is softmax, so from_logits=False
93 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
94 |     # if labels are one-hot encoded, use standard crossentropy
95 | 
96 |     return scce(y_true, y_pred)  # .numpy()
97 | 
98 | 
99 | def get_dataset():
100 | 
101 |     # Read in the mnist data so we have it loaded globally:
102 |     (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
103 |     x_train = x_train.astype(numpy.float32)
104 |     x_test = x_test.astype(numpy.float32)
105 | 
106 |     x_train /= 255.
107 |     x_test /= 255.
108 | 
109 |     y_train = y_train.astype(numpy.int32)
110 |     y_test = y_test.astype(numpy.int32)
111 | 
112 |     return x_train, x_test, y_train, y_test
113 | 
114 | 
115 | def fetch_batch(_batch_size):
116 |     x_train, x_test, y_train, y_test = get_dataset()
117 | 
118 |     indexes = numpy.random.choice(a=x_train.shape[0], size=[_batch_size,])
119 | 
120 |     images = x_train[indexes].reshape(_batch_size, 28, 28, 1)
121 |     labels = y_train[indexes].reshape(_batch_size, 1)
122 | 
123 |     return images, labels
124 | 
125 | 
126 | @profile
127 | def forward_pass(model, batch_size):
128 |     batch_data, y_true = fetch_batch(batch_size)
129 |     y_pred = model(batch_data)
130 |     loss = compute_loss(y_true, y_pred)
131 |     return loss
132 | 
133 | 
134 | @profile
135 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
136 | 
137 |     logger = logging.getLogger()
138 | 
139 |     rank = hvd.rank()
140 |     for i_epoch in range(n_training_epochs):
141 | 
142 |         epoch_steps = int(60000/batch_size)
143 | 
144 |         for i_batch in range(epoch_steps):
145 | 
146 |             start = time.time()
147 | 
148 |             with tf.GradientTape() as tape:
149 |                 loss = forward_pass(model, batch_size)
150 | 
151 |             if global_size != 1:
152 |                 tape = hvd.DistributedGradientTape(tape)
153 | 
154 |             trainable_vars = model.trainable_variables
155 | 
156 |             # Apply the update to the network (one at a time):
157 |             grads = tape.gradient(loss, trainable_vars)
158 | 
159 |             opt.apply_gradients(zip(grads, trainable_vars))
160 | 
161 |             end = time.time()
162 | 
163 |             images = batch_size*global_size
164 | 
165 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
166 | 
167 | 
168 | @profile
169 | def train_network(_batch_size, _training_iterations, _lr, global_size):
170 | 
171 |     mnist_model = MNISTClassifier()
172 | 
173 |     opt = tf.keras.optimizers.Adam(_lr)
174 | 
175 |     if global_size != 1:
176 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
177 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
178 | 
179 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
180 | 
181 | 
182 | if __name__ == '__main__':
183 | 
184 |     rank, size = init_mpi()
185 |     configure_logger(rank)
186 | 
187 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
188 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
189 |                         help='input batch size for training (default: 64)')
190 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
191 |                         help='number of epochs to train (default: 10)')
192 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
193 |                         help='learning rate (default: 0.01)')
194 |     # parser.add_argument('--device', default='cpu',
195 |     #                     help='Whether this is running on cpu or gpu')
196 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
197 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
198 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
199 |     #                     type=int)
200 | 
201 |     args = parser.parse_args()
202 |     scaled_lr = args.lr * hvd.size()
203 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
204 | 
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/README.md:
--------------------------------------------------------------------------------
1 | # Reduced and Mixed Precision
2 | 
3 | *Note*: for a more extensive discussion and tutorial related to mixed precision training of neural networks, refer to [CPW21: Reduced and Mixed Precision](https://github.com/argonne-lcf/CompPerfWorkshop-2021/tree/main/10_reduced-precision).
4 | 
5 | Switching to reduced or mixed precision is not that hard in TensorFlow's Keras API:
6 | ```python
7 | tf.keras.mixed_precision.set_global_policy("mixed_float16")
8 | ```
9 | 
10 | In this case, there are also a bunch of places where I hard-coded `float32` - oops! Fix those too. The final layer (softmax) should **not** use mixed precision.
11 | 
12 | In general, loss scaling should be used to prevent numerical underflow (and sometimes overflow) when using `float16`. See [the TensorFlow guide to Mixed Precision](https://www.tensorflow.org/guide/mixed_precision#training_the_model_with_a_custom_training_loop) for more details; a minimal custom-loop sketch is also included at the end of this README.
13 | 
14 | Now run the code with mixed precision and the default Keras loss scale optimizer (both XLA and profiling are disabled here):
15 | ```
16 | python train_MNIST_tf_function_XLA_mixed.py --epochs 1 --batch_size 1024
17 | 
18 | ...
19 | 2021-08-10 02:35:00,466 - INFO - (0, 46), Loss: 0.11277, step_time: 0.00532, throughput: 1.93e+05 img/s.
20 | 2021-08-10 02:35:00,473 - INFO - (0, 47), Loss: 0.08135, step_time: 0.00533, throughput: 1.92e+05 img/s.
21 | 2021-08-10 02:35:00,485 - INFO - (0, 48), Loss: 0.10929, step_time: 0.01039, throughput: 9.85e+04 img/s.
22 | 2021-08-10 02:35:00,493 - INFO - (0, 49), Loss: 0.10958, step_time: 0.00538, throughput: 1.90e+05 img/s.
23 | 2021-08-10 02:35:00,500 - INFO - (0, 50), Loss: 0.09276, step_time: 0.00539, throughput: 1.90e+05 img/s.
24 | 2021-08-10 02:35:00,508 - INFO - (0, 51), Loss: 0.09720, step_time: 0.00543, throughput: 1.88e+05 img/s.
25 | 2021-08-10 02:35:00,515 - INFO - (0, 52), Loss: 0.07887, step_time: 0.00538, throughput: 1.90e+05 img/s.
26 | 2021-08-10 02:35:00,523 - INFO - (0, 53), Loss: 0.06713, step_time: 0.00526, throughput: 1.95e+05 img/s.
27 | ```
28 | The performance has gotten significantly worse (5x slower than before). Quick investigation reveals that the _dynamic_ loss scaling optimizer is responsible for the degradation. Typically, such default, automated loss scaling is relatively inexpensive and should be used. However, with such a simple network architecture and training loop, the local work is so small and the image throughput is so high that even this small extra cost can harm your peak efficiency.
29 | 
30 | Comment out the default `LossScaleOptimizer` and use the one with `dynamic=False`:
31 | ```
32 | # Dynamic loss scaling (more expensive, but more reliable)
33 | #opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.Adam(_lr))
34 | # Fixed loss scaling (cheap)
35 | opt = tf.keras.mixed_precision.LossScaleOptimizer(
36 |     tf.keras.optimizers.Adam(_lr),
37 |     dynamic=False,
38 |     initial_scale=1024,
39 | )
40 | ```
41 | If you use too large of an `initial_scale`, the loss function will return NaNs. Using a fixed scale of 1024 seems fairly robust for our case:
42 | ```
43 | 2021-08-10 02:30:46,418 - INFO - (0, 48), Loss: 0.08706, step_time: 0.00100, throughput: 1.02e+06 img/s.
44 | 2021-08-10 02:30:46,425 - INFO - (0, 49), Loss: 0.10371, step_time: 0.00101, throughput: 1.02e+06 img/s.
45 | 2021-08-10 02:30:46,432 - INFO - (0, 50), Loss: 0.09754, step_time: 0.00100, throughput: 1.03e+06 img/s.
46 | 2021-08-10 02:30:46,438 - INFO - (0, 51), Loss: 0.09409, step_time: 0.00099, throughput: 1.03e+06 img/s.
47 | 2021-08-10 02:30:46,445 - INFO - (0, 52), Loss: 0.06493, step_time: 0.00101, throughput: 1.01e+06 img/s.
48 | 2021-08-10 02:30:46,451 - INFO - (0, 53), Loss: 0.05253, step_time: 0.00100, throughput: 1.03e+06 img/s.
49 | 2021-08-10 02:30:46,458 - INFO - (0, 54), Loss: 0.07965, step_time: 0.00100, throughput: 1.02e+06 img/s.
50 | 2021-08-10 02:30:46,465 - INFO - (0, 55), Loss: 0.06847, step_time: 0.00100, throughput: 1.02e+06 img/s.
51 | ```
52 | 
53 | This is disappointing! Just like with XLA, we ran with mixed precision and it is slightly SLOWER. Let's look into the profile to discover why.
54 | 
55 | Here's the overview page. We note right away that, in the bottom left, it IS using a good amount of reduced precision.
56 | 
57 | ![Tensorboard Profiler Overview](./images/profiler_overview.png)
58 | 
59 | Scrolling down:
60 | 
61 | ![top 10](./images/top10-ops.png)
62 | 
63 | Compared to the float32 top-10 operations, this is pretty different. There is one op that is particularly dominant! It is still a conv2d operation, but for some reason it is much slower than the others.
64 | 
65 | Here is the Kernel Statistics page again:
66 | 
67 | ![kernel stats](./images/kernel-stats.png)
68 | 
69 | We see the same problem there - except this time it's pointing to the wgrad (weight gradient) calculation of the second Conv2D layer. The TensorFlow Statistics shows similar info:
70 | 
71 | ![tf stats](./images/tf-stats.png)
72 | 
73 | And there is also a timeline view of all ops (trace viewer):
74 | 
75 | ![timeline](./images/trace-viewer.png)
76 | 
77 | And zoomed:
78 | 
79 | ![timeline zoom](./images/trace-zoomed.png)
80 | 
81 | Compare this to the trace viewer for the `float32` case:
82 | 
83 | ![timeline zoom fp32](./images/trace-zoomed-float32.png)
84 | 
85 | So, reduced precision appears to be slower because of a marginally larger kernel launch
86 | time for the `float16` kernels vs. the `float32` versions of the kernels (which are
87 | **not** using the TensorCores of the A100). This especially affects the backpropagation
88 | weight gradient calculation of the second Conv2D layer, as seen above.
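
As a closing note on the loss-scaling discussion earlier in this README: with a custom training loop, the `LossScaleOptimizer` does not apply the scale for you automatically. Below is a minimal sketch of the fixed-scale pattern, following the TensorFlow mixed-precision guide linked above. It is illustrative only - `MNISTClassifier` and `compute_loss` refer to the tutorial code, and the learning rate and scale are simply the values discussed above:

```python
import tensorflow as tf

tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Fixed (cheap) loss scaling, as recommended above for this tiny network:
opt = tf.keras.mixed_precision.LossScaleOptimizer(
    tf.keras.optimizers.Adam(0.01), dynamic=False, initial_scale=1024)

model = MNISTClassifier()

@tf.function
def train_step(batch_data, y_true):
    with tf.GradientTape() as tape:
        loss = compute_loss(y_true, model(batch_data))
        # Scale the loss up so small float16 gradients don't flush to zero...
        scaled_loss = opt.get_scaled_loss(loss)
    scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
    # ...then undo the scaling on the gradients before applying the update.
    grads = opt.get_unscaled_gradients(scaled_grads)
    opt.apply_gradients(zip(grads, model.trainable_variables))
    return loss
```

Because `dynamic=False` means the scale is never re-tuned at runtime, this avoids the overhead measured above - but a poorly chosen `initial_scale` will silently produce NaNs, so keep an eye on the loss.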
89 | 
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/kernel-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/kernel-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/profiler_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/profiler_overview.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/tf-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/tf-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/top10-ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/top10-ops.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-viewer.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-zoomed-float32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-zoomed-float32.png
--------------------------------------------------------------------------------
/03_profileLearning/reduced_precision/images/trace-zoomed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/reduced_precision/images/trace-zoomed.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/README.md:
--------------------------------------------------------------------------------
1 | # TF Function and Graph Compilation
2 | 
3 | Using line profiler showed us that the largest share of the computation, by far, was the train loop and its subcalls. Here, we'll wrap those functions in `@tf.function` decorators to improve performance with graph compilation. Additionally, some of the operations have to be re-written to stay entirely within the TensorFlow library and not use NumPy calls (a minimal sketch of that pattern appears just below).
4 | 
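
To make the "stay inside TensorFlow" point concrete, here is a minimal sketch of the kind of rewrite involved. It is illustrative only - the `sample_batch` helper below is hypothetical, not part of the tutorial scripts - but it shows the NumPy-style batch selection from the earlier `fetch_batch()` re-expressed with TensorFlow ops so the whole step can be traced into a graph:

```python
import tensorflow as tf

# Before (NumPy runs eagerly, outside the graph, so it defeats tracing):
#   indexes = numpy.random.choice(a=x_train.shape[0], size=[batch_size,])
#   images  = x_train[indexes].reshape(batch_size, 28, 28, 1)

@tf.function  # traced once, then replayed as a compiled graph
def sample_batch(x_train, y_train, batch_size):
    # Pure-TF equivalent: random integer indices plus a gather stay in the graph.
    n = tf.shape(x_train)[0]
    indexes = tf.random.uniform([batch_size], maxval=n, dtype=tf.int32)
    images = tf.reshape(tf.gather(x_train, indexes), [batch_size, 28, 28, 1])
    labels = tf.gather(y_train, indexes)
    return images, labels
```

(The scripts in this folder actually go one step further and draw batches from a `tf.data.Dataset` instead of sampling randomly.)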
5 | This has a mild overhead at the start for tracing but then runs more quickly. Observe how the first few iterations are slower:
6 | ```
7 | python train_MNIST_tf_function.py --epochs 1
8 | ```
9 | Out of the box, we see a dramatic speed up:
10 | ```
11 | 2021-08-10 01:46:46,371 - INFO - (0, 923), Loss: 0.001, step_time: 0.001, throughput: 74358.852 img/s.
12 | 2021-08-10 01:46:46,372 - INFO - (0, 924), Loss: 0.000, step_time: 0.001, throughput: 74030.738 img/s.
13 | 2021-08-10 01:46:46,374 - INFO - (0, 925), Loss: 0.000, step_time: 0.001, throughput: 66296.729 img/s.
14 | 2021-08-10 01:46:46,375 - INFO - (0, 926), Loss: 0.123, step_time: 0.001, throughput: 74379.456 img/s.
15 | 2021-08-10 01:46:46,377 - INFO - (0, 927), Loss: 0.035, step_time: 0.001, throughput: 67480.004 img/s.
16 | 2021-08-10 01:46:46,378 - INFO - (0, 928), Loss: 0.040, step_time: 0.001, throughput: 73847.443 img/s.
17 | 2021-08-10 01:46:46,380 - INFO - (0, 929), Loss: 0.017, step_time: 0.001, throughput: 73564.115 img/s.
18 | 2021-08-10 01:46:46,381 - INFO - (0, 930), Loss: 0.000, step_time: 0.001, throughput: 71335.492 img/s.
19 | 2021-08-10 01:46:46,383 - INFO - (0, 931), Loss: 0.007, step_time: 0.001, throughput: 71985.909 img/s.
20 | 2021-08-10 01:46:46,384 - INFO - (0, 932), Loss: 0.268, step_time: 0.001, throughput: 72354.570 img/s.
21 | 2021-08-10 01:46:46,385 - INFO - (0, 933), Loss: 0.443, step_time: 0.001, throughput: 73282.953 img/s.
22 | 2021-08-10 01:46:46,387 - INFO - (0, 934), Loss: 0.047, step_time: 0.001, throughput: 74214.945 img/s.
23 | 2021-08-10 01:46:46,388 - INFO - (0, 935), Loss: 0.011, step_time: 0.001, throughput: 74153.441 img/s.
24 | ```
25 | 
26 | Instead of ~237 img/s, we're at 74,000 img/s. But note that the step time is now only 0.001 seconds. We're likely not doing enough work per batch. Increase the batch size from 64 to 1024 or 2048:
27 | ```
28 | python train_MNIST_tf_function.py --epochs 1 --batch_size 1024
29 | ```
30 | Which yields:
31 | ```
32 | 2021-08-10 01:44:13,671 - INFO - (0, 29), Loss: 0.158, step_time: 0.001, throughput: 1099582.001 img/s.
33 | 2021-08-10 01:44:13,679 - INFO - (0, 30), Loss: 0.202, step_time: 0.001, throughput: 1085958.861 img/s.
34 | 2021-08-10 01:44:13,686 - INFO - (0, 31), Loss: 0.172, step_time: 0.001, throughput: 1093980.463 img/s.
35 | 2021-08-10 01:44:13,694 - INFO - (0, 32), Loss: 0.140, step_time: 0.001, throughput: 1093423.446 img/s.
36 | 2021-08-10 01:44:13,701 - INFO - (0, 33), Loss: 0.117, step_time: 0.001, throughput: 1087608.837 img/s.
37 | 2021-08-10 01:44:13,709 - INFO - (0, 34), Loss: 0.120, step_time: 0.001, throughput: 1074278.963 img/s.
38 | 2021-08-10 01:44:13,716 - INFO - (0, 35), Loss: 0.140, step_time: 0.001, throughput: 1052172.292 img/s.
39 | 2021-08-10 01:44:13,724 - INFO - (0, 36), Loss: 0.184, step_time: 0.001, throughput: 1062584.685 img/s.
40 | 2021-08-10 01:44:13,731 - INFO - (0, 37), Loss: 0.108, step_time: 0.001, throughput: 1053721.123 img/s.
41 | 2021-08-10 01:44:13,739 - INFO - (0, 38), Loss: 0.112, step_time: 0.001, throughput: 1064691.942 img/s
42 | ```
43 | 
44 | We can also enable [XLA Fusion](https://www.tensorflow.org/xla) (for GPU or CPU) to speed up the computations by fusing small ops together.
45 | 
46 | XLA can be enabled (for code within `tf.function` sections) without any changes to your code by setting an environment variable on the command line:
47 | `TF_XLA_FLAGS=--tf_xla_auto_jit=2 python train_MNIST_tf_function.py --epochs 1 --batch_size 1024`
48 | This examines your `tf.function()` decorated functions and automatically decides if/how to cluster them in XLA operations.
49 | > A simple way to start using XLA in TensorFlow models without any changes is to enable auto-clustering, which automatically finds clusters (connected subgraphs) within the TensorFlow functions which can be compiled and executed using XLA.
50 | 
51 | Or, XLA can be manually enabled by explicitly setting `@tf.function(jit_compile=True)` for the desired functions. This option was formerly called `experimental_compile` (before TensorFlow v2.5.0). Try:
52 | ```
53 | python train_MNIST_tf_function_XLA.py --epochs 1 --batch_size 1024
54 | ```
55 | 
56 | In this case, XLA actually **harms performance** as our training throughput drops to 650k images/second:
57 | ```
58 | 2021-08-10 01:55:04,462 - INFO - (0, 10), Loss: 0.381, step_time: 0.002, throughput: 643248.060 img/s.
59 | 2021-08-10 01:55:04,470 - INFO - (0, 11), Loss: 0.500, step_time: 0.002, throughput: 646929.853 img/s.
60 | 2021-08-10 01:55:04,478 - INFO - (0, 12), Loss: 0.506, step_time: 0.002, throughput: 652929.051 img/s.
61 | 2021-08-10 01:55:04,486 - INFO - (0, 13), Loss: 0.371, step_time: 0.002, throughput: 646345.718 img/s.
62 | 2021-08-10 01:55:04,494 - INFO - (0, 14), Loss: 0.405, step_time: 0.002, throughput: 650752.621 img/s.
63 | 2021-08-10 01:55:04,502 - INFO - (0, 15), Loss: 0.296, step_time: 0.002, throughput: 654022.734 img/s.
64 | 2021-08-10 01:55:04,510 - INFO - (0, 16), Loss: 0.255, step_time: 0.002, throughput: 651147.255 img/s.
65 | 2021-08-10 01:55:04,518 - INFO - (0, 17), Loss: 0.250, step_time: 0.002, throughput: 648982.668 img/s.
66 | 2021-08-10 01:55:04,526 - INFO - (0, 18), Loss: 0.195, step_time: 0.002, throughput: 657124.739 img/s.
67 | 2021-08-10 01:55:04,534 - INFO - (0, 19), Loss: 0.213, step_time: 0.002, throughput: 653525.152 img/s.
68 | ```
69 | 
70 | As an aside, using a slightly older version of TensorFlow than the July 2021 NGC image release leads to a seg-fault when XLA is used on any of the `tf.function` sections other than `compute_loss()`. The generated XLA programs can be dumped with:
71 | ```
72 | TF_DUMP_GRAPH_PREFIX=/tmp/generated TF_XLA_FLAGS="--tf_xla_clustering_debug --tf_xla_auto_jit=2" XLA_FLAGS="--xla_dump_hlo_as_text --xla_dump_to=/tmp/generated" python train_MNIST_tf_function.py --epochs 1 --batch_size 1024
73 | ```
74 | and are kept for reference in [`xla_bug_generated/`](./xla_bug_generated/). Feel free to try it yourself (and maybe open a bug report in the TensorFlow repository!):
75 | `/lus/theta-fs0/software/thetagpu/nvidia-containers/tensorflow2/tf2_21.04-py3.simg`
76 | 
77 | Beyond this, we'll have to run the TensorFlow Profiler. That is in the next folder, [`tf_profiler/`](../tf_profiler/).
78 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/train_MNIST_tf_function.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | # Read in the mnist data so we have it loaded globally:
14 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
15 | x_train = x_train.astype(numpy.float32)
16 | x_test = x_test.astype(numpy.float32)
17 | 
18 | x_train /= 255.
19 | x_test /= 255.
20 | 
21 | y_train = y_train.astype(numpy.int32)
22 | y_test = y_test.astype(numpy.int32)
23 | 
24 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
25 | dataset = dataset.shuffle(60000)  # shuffle() returns a new dataset; the result must be kept
26 | 
27 | 
28 | def init_mpi():
29 |     # Try to initialize Horovod to determine if we're running under MPI:
30 |     try:
31 |         hvd.init()
32 |         return hvd.rank(), hvd.size()
33 |     except:
34 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
35 |             raise Exception("MPI detected in command line but was not able to init!")
36 |         return 0, 1
37 | 
38 | 
39 | def configure_logger(rank):
40 |     '''Configure a global logger
41 | 
42 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
43 | 
44 |     Pass in the MPI rank.
45 | 
46 |     '''
47 |     logger = logging.getLogger()
48 | 
49 |     # Create a handler for STDOUT, but only on the root rank.
50 |     # If not distributed, we still get 0 passed in here.
51 |     if rank == 0:
52 |         stream_handler = logging.StreamHandler()
53 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
54 |         stream_handler.setFormatter(formatter)
55 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
56 |         logger.addHandler(handler)
57 | 
58 |         # Add a file handler too:
59 |         log_file = "process.log"
60 |         file_handler = logging.FileHandler(log_file)
61 |         file_handler.setFormatter(formatter)
62 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
63 |         logger.addHandler(file_handler)
64 | 
65 |         logger.setLevel(logging.INFO)
66 |     else:
67 |         # in this case, MPI is available but it's not rank 0
68 |         # create a null handler
69 |         handler = logging.NullHandler()
70 |         logger.addHandler(handler)
71 |         logger.setLevel(logging.INFO)
72 | 
73 | 
74 | class MNISTClassifier(tf.keras.models.Model):
75 | 
76 |     def __init__(self, activation=tf.nn.tanh):
77 |         tf.keras.models.Model.__init__(self)
78 | 
79 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
80 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
81 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
82 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
83 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
84 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
85 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
86 | 
87 |     #@tf.function
88 |     def call(self, inputs):
89 |         '''
90 |         Reshape at input and output:
91 |         '''
92 |         # batch_size = inputs.shape[0]
93 | 
94 |         x = self.conv_1(inputs)
95 |         x = self.conv_2(x)
96 |         x = self.pool_3(x)
97 |         x = self.drop_4(x)
98 |         x = tf.keras.layers.Flatten()(x)
99 |         x = self.dense_5(x)
100 |         x = self.drop_6(x)
101 |         x = self.dense_7(x)
102 | 
103 |         return x
104 | 
105 | 
106 | #@tf.function
107 | def compute_loss(y_true, y_pred):
108 |     # if labels are integers, use sparse categorical crossentropy
109 |     # network's final layer is softmax, so from_logits=False
110 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
111 |     # if labels are one-hot encoded, use standard crossentropy
112 | 
113 |     return scce(y_true, y_pred)  # .numpy()
114 | 
115 | 
116 | #@tf.function
117 | def forward_pass(model, batch_data, y_true):
118 |     y_pred = model(batch_data)
119 |     loss = compute_loss(y_true, y_pred)
120 |     return loss
121 | 
122 | 
123 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
124 | 
125 |     @tf.function
126 |     def train_iteration(data, y_true, model, opt, global_size):
127 |         with tf.GradientTape() as tape:
128 |             loss = forward_pass(model, data, y_true)
129 | 
130 |         if global_size != 1:
131 |             tape = hvd.DistributedGradientTape(tape)
132 | 
133 |         trainable_vars = model.trainable_variables
134 | 
135 |         # Apply the update to the network (one at a time):
136 |         grads = tape.gradient(loss, trainable_vars)
137 | 
138 |         opt.apply_gradients(zip(grads, trainable_vars))
139 |         return loss
140 | 
141 | 
142 |     logger = logging.getLogger()
143 | 
144 |     rank = hvd.rank()
145 |     for i_epoch in range(n_training_epochs):
146 | 
147 |         epoch_steps = int(60000/batch_size)
148 |         shuffled = dataset.shuffle(60000)  # Shuffle the whole dataset in memory; shuffle() returns a new dataset
149 |         batches = shuffled.batch(batch_size=batch_size, drop_remainder=True)
150 | 
151 |         for i_batch, (batch_data, y_true) in enumerate(batches):
152 | 
153 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
154 | 
155 |             start = time.time()
156 | 
157 |             loss = train_iteration(batch_data, y_true, model, opt, global_size)
158 | 
159 |             end = time.time()
160 | 
161 |             images = batch_size*global_size
162 | 
163 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
164 | 
165 | 
166 | def train_network(_batch_size, _training_iterations, _lr, global_size):
167 | 
168 |     mnist_model = MNISTClassifier()
169 | 
170 |     opt = tf.keras.optimizers.Adam(_lr)
171 | 
172 |     if global_size != 1:
173 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
174 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
175 | 
176 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
177 | 
178 | 
179 | if __name__ == '__main__':
180 | 
181 |     rank, size = init_mpi()
182 |     configure_logger(rank)
183 | 
184 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
185 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
186 |                         help='input batch size for training (default: 64)')
187 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
188 |                         help='number of epochs to train (default: 10)')
189 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
190 |                         help='learning rate (default: 0.01)')
191 |     # parser.add_argument('--device', default='cpu',
192 |     #                     help='Whether this is running on cpu or gpu')
193 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
194 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
195 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
196 |     #                     type=int)
197 | 
198 |     args = parser.parse_args()
199 |     scaled_lr = args.lr * hvd.size()
200 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
201 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_function/train_MNIST_tf_function_XLA.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | # Read in the mnist data so we have it loaded globally:
14 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
15 | x_train = x_train.astype(numpy.float32)
16 | x_test = x_test.astype(numpy.float32)
17 | 
18 | x_train /= 255.
19 | x_test /= 255.
20 | 
21 | y_train = y_train.astype(numpy.int32)
22 | y_test = y_test.astype(numpy.int32)
23 | 
24 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
25 | dataset = dataset.shuffle(60000)  # shuffle() returns a new dataset; the result must be kept
26 | 
27 | 
28 | def init_mpi():
29 |     # Try to initialize Horovod to determine if we're running under MPI:
30 |     try:
31 |         hvd.init()
32 |         return hvd.rank(), hvd.size()
33 |     except:
34 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
35 |             raise Exception("MPI detected in command line but was not able to init!")
36 |         return 0, 1
37 | 
38 | 
39 | def configure_logger(rank):
40 |     '''Configure a global logger
41 | 
42 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
43 | 
44 |     Pass in the MPI rank.
45 | 
46 |     '''
47 |     logger = logging.getLogger()
48 | 
49 |     # Create a handler for STDOUT, but only on the root rank.
50 |     # If not distributed, we still get 0 passed in here.
51 |     if rank == 0:
52 |         stream_handler = logging.StreamHandler()
53 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
54 |         stream_handler.setFormatter(formatter)
55 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
56 |         logger.addHandler(handler)
57 | 
58 |         # Add a file handler too:
59 |         log_file = "process.log"
60 |         file_handler = logging.FileHandler(log_file)
61 |         file_handler.setFormatter(formatter)
62 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
63 |         logger.addHandler(file_handler)
64 | 
65 |         logger.setLevel(logging.INFO)
66 |     else:
67 |         # in this case, MPI is available but it's not rank 0
68 |         # create a null handler
69 |         handler = logging.NullHandler()
70 |         logger.addHandler(handler)
71 |         logger.setLevel(logging.INFO)
72 | 
73 | 
74 | class MNISTClassifier(tf.keras.models.Model):
75 | 
76 |     def __init__(self, activation=tf.nn.tanh):
77 |         tf.keras.models.Model.__init__(self)
78 | 
79 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
80 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
81 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
82 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
83 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
84 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
85 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
86 | 
87 |     #@tf.function(jit_compile=True)
88 |     def call(self, inputs):
89 |         '''
90 |         Reshape at input and output:
91 |         '''
92 |         # batch_size = inputs.shape[0]
93 | 
94 |         x = self.conv_1(inputs)
95 |         x = self.conv_2(x)
96 |         x = self.pool_3(x)
97 |         x = self.drop_4(x)
98 |         x = tf.keras.layers.Flatten()(x)
99 |         x = self.dense_5(x)
100 |         x = self.drop_6(x)
101 |         x = self.dense_7(x)
102 | 
103 |         return x
104 | 
105 | 
106 | #@tf.function(jit_compile=True)
107 | def compute_loss(y_true, y_pred):
108 |     # if labels are integers, use sparse categorical crossentropy
109 |     # network's final layer is softmax, so from_logits=False
110 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
111 |     # if labels are one-hot encoded, use standard crossentropy
112 | 
113 |     return scce(y_true, y_pred)  # .numpy()
114 | 
115 | 
116 | #@tf.function(jit_compile=True)
117 | def forward_pass(model, batch_data, y_true):
118 |     y_pred = model(batch_data)
119 |     loss = compute_loss(y_true, y_pred)
120 |     return loss
121 | 
122 | 
123 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
124 | 
125 |     @tf.function(jit_compile=True)
126 |     def train_iteration(data, y_true, model, opt, global_size):
127 |         with tf.GradientTape() as tape:
128 |             loss = forward_pass(model, data, y_true)
129 | 
130 |         if global_size != 1:
131 |             tape = hvd.DistributedGradientTape(tape)
132 | 
133 |         trainable_vars = model.trainable_variables
134 | 
135 |         # Apply the update to the network (one at a time):
136 |         grads = tape.gradient(loss, trainable_vars)
137 | 
138 |         opt.apply_gradients(zip(grads, trainable_vars))
139 |         return loss
140 | 
141 | 
142 |     logger = logging.getLogger()
143 | 
144 |     rank = hvd.rank()
145 |     for i_epoch in range(n_training_epochs):
146 | 
147 |         epoch_steps = int(60000/batch_size)
148 |         shuffled = dataset.shuffle(60000)  # Shuffle the whole dataset in memory; shuffle() returns a new dataset
149 |         batches = shuffled.batch(batch_size=batch_size, drop_remainder=True)
150 | 
151 |         for i_batch, (batch_data, y_true) in enumerate(batches):
152 | 
153 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
154 | 
155 |             start = time.time()
156 | 
157 |             loss = train_iteration(batch_data, y_true, model, opt, global_size)
158 | 
159 |             end = time.time()
160 | 
161 |             images = batch_size*global_size
162 | 
163 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start :.3f}, throughput: {images/(end-start):.3f} img/s.")
164 | 
165 | 
166 | def train_network(_batch_size, _training_iterations, _lr, global_size):
167 | 
168 |     mnist_model = MNISTClassifier()
169 | 
170 |     opt = tf.keras.optimizers.Adam(_lr)
171 | 
172 |     if global_size != 1:
173 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
174 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
175 | 
176 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
177 | 
178 | 
179 | if __name__ == '__main__':
180 | 
181 |     rank, size = init_mpi()
182 |     configure_logger(rank)
183 | 
184 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
185 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
186 |                         help='input batch size for training (default: 64)')
187 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
188 |                         help='number of epochs to train (default: 10)')
189 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
190 |                         help='learning rate (default: 0.01)')
191 |     # parser.add_argument('--device', default='cpu',
192 |     #                     help='Whether this is running on cpu or gpu')
193 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
194 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
195 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
196 |     #                     type=int)
197 | 
198 |     args = parser.parse_args()
199 |     scaled_lr = args.lr * hvd.size()
200 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
201 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/README.md:
--------------------------------------------------------------------------------
1 | # TF Profiling Tool
2 | 
3 | 
4 | Note the lines `tf.profiler.experimental.start('logdir')` and `tf.profiler.experimental.stop()` in the code. This sets up and tears down the profiling tool built in to TensorFlow. See the screenshots below - the main operation is conv2D backprop - a very compute heavy operation. We may get some further performance improvement with reduced precision - see the [`reduced_precision/`](../reduced_precision) folder.
5 | 
6 | 
7 | # Running the TensorFlow Profiler
8 | 
9 | When you've captured your profile data, the profiler will dump it into the folder `logdir` (as above) and you will have to view it. The simplest way, for this application, is to copy it to your own laptop if you have TensorFlow installed. If not, you can run TensorBoard on ThetaGPU and use SSH port forwarding to view it on your own laptop.
10 | 
11 | Whatever you do, you can open TensorBoard like so:
12 | ```
13 | tensorboard --load_fast=false --logdir [your/own/path/to/logdir/]
14 | ```
15 | For my local macOS system with TensorFlow 2.5.0, TensorBoard did not show any data unless the experimental fast loading logic (which theoretically offers 100x-400x shorter loading times) was disabled. See https://github.com/tensorflow/tensorboard/issues/4784
16 | 
17 | Next, open your browser and navigate to `localhost:6006` (or whatever port you forwarded to) and you'll see a screen like the one below:
18 | 
19 | ![TensorBoard Profiler Overview](./images/profiler_overview.png)
20 | 
21 | And, if you scroll down, you'll see the list of the top 10 most time consuming operations:
22 | 
23 | ![top 10](./images/top10_ops.png)
24 | 
25 | This list shows us that the top operations are largely all convolution ops (particularly backprop). The profiler at the top also points out that 0% of the graph is in reduced precision, which could give us a speedup. We'll try that next, but first let's review the other tabs:
26 | 
27 | Here is the Kernel Statistics page:
28 | 
29 | ![kernel stats](./images/kernel-stats.png)
30 | 
31 | Again, this shows that the convolution operations are all the most dominant and (roughly) equally distributed.
32 | 
33 | The TensorFlow statistics shows similar info:
34 | 
35 | ![tf stats](./images/tf-stats.png)
36 | 
37 | And there is also a timeline view of all ops (trace viewer):
38 | 
39 | ![timeline](./images/trace-viewer.png)
40 | 
41 | And zoomed:
42 | 
43 | ![timeline zoom](./images/trace-viewer-zoom.png)
44 | 
45 | Now, let's try running in reduced precision.
46 | 
47 | 
48 | # Using Port Forwarding
49 | 
50 | You can view the processes and how they occupy the compute resources in TensorFlow using TensorBoard.
51 | 
52 | You can login to Theta using:
53 | ```bash
54 | # our proxy port, must be > 1024 and every user needs a different port
55 | export PORT=10001
56 | # login to theta with a port forwarding
57 | ssh -D $PORT user@theta.alcf.anl.gov
58 | # load any conda environment that has a compatible tensorboard installation
59 | module load conda
60 | # add CUDA libraries if you are running on ThetaGPU
61 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/lus/theta-fs0/software/thetagpu/cuda/TensorRT-8.5.2.2/lib:/lus/theta-fs0/software/thetagpu/cuda/nccl_2.16.2-1+cuda11.8_x86_64/lib:/lus/theta-fs0/software/thetagpu/cuda/cudnn-linux-x86_64-8.6.0.163_cuda11-archive/lib:/lus/theta-fs0/software/thetagpu/cuda/cuda-11.8.0/extras/CUPTI/lib64:/lus/theta-fs0/software/thetagpu/cuda/cuda-11.8.0/lib64
62 | # start tensorboard (load_fast=false is a recent setting that seems to be needed until TensorFlow works out the bugs)
63 | tensorboard --bind_all --logdir . --load_fast=false
64 | ```
65 | Note the port number that `tensorboard` reports when it starts up.
66 | 
67 | Only 1 user can use a specific port, so if you get an error choose another port number larger than `1024`.
68 | 
69 | Once you have that set up, configure the SOCKS5 proxy of your favorite browser with host localhost and port $PORT (where $PORT is the value you used in the above script, like 10001). Now, in the browser URL bar, enter the login node on which you started tensorboard. For instance, if you are on thetalogin6, you can type in thetalogin6.alcf.anl.gov:6006. Here 6006 is the port that tensorboard uses by default to start up its web service, but it may vary if you customize it.
70 | 
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/kernel-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/kernel-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/profiler_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/profiler_overview.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/tf-stats.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/tf-stats.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/top10_ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/top10_ops.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/trace-viewer-zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/trace-viewer-zoom.png
--------------------------------------------------------------------------------
/03_profileLearning/tf_profiler/images/trace-viewer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/03_profileLearning/tf_profiler/images/trace-viewer.png
--------------------------------------------------------------------------------
/03_profileLearning/train_MNIST.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import time
3 | import argparse
4 | import logging
5 | from logging import handlers
6 | 
7 | import tensorflow as tf
8 | import numpy
9 | 
10 | import horovod.tensorflow as hvd
11 | 
12 | 
13 | def init_mpi():
14 |     # Try to initialize Horovod to determine if we're running under MPI:
15 |     try:
16 |         hvd.init()
17 |         return hvd.rank(), hvd.size()
18 |     except:
19 |         if "mpirun" in sys.argv or "mpiexec" in sys.argv:
20 |             raise Exception("MPI detected in command line but was not able to init!")
21 |         return 0, 1
22 | 
23 | 
24 | def configure_logger(rank):
25 |     '''Configure a global logger
26 | 
27 |     Adds a stream handler and a file handler; buffers to file (10 lines) but not to stdout.
28 | 
29 |     Pass in the MPI rank.
30 | 
31 |     '''
32 |     logger = logging.getLogger()
33 | 
34 |     # Create a handler for STDOUT, but only on the root rank.
35 |     # If not distributed, we still get 0 passed in here.
36 |     if rank == 0:
37 |         stream_handler = logging.StreamHandler()
38 |         formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
39 |         stream_handler.setFormatter(formatter)
40 |         handler = handlers.MemoryHandler(capacity=0, target=stream_handler)
41 |         logger.addHandler(handler)
42 |
43 |         # Add a file handler too:
44 |         log_file = "process.log"
45 |         file_handler = logging.FileHandler(log_file)
46 |         file_handler.setFormatter(formatter)
47 |         file_handler = handlers.MemoryHandler(capacity=10, target=file_handler)
48 |         logger.addHandler(file_handler)
49 |
50 |         logger.setLevel(logging.INFO)
51 |     else:
52 |         # in this case, MPI is available but it's not rank 0
53 |         # create a null handler
54 |         handler = logging.NullHandler()
55 |         logger.addHandler(handler)
56 |         logger.setLevel(logging.INFO)
57 |
58 |
59 | class MNISTClassifier(tf.keras.models.Model):
60 |
61 |     def __init__(self, activation=tf.nn.tanh):
62 |         tf.keras.models.Model.__init__(self)
63 |
64 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
65 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
66 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
67 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
68 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
69 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
70 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
71 |
72 |     def call(self, inputs):
73 |         '''
74 |         Forward pass; inputs are expected as [batch, 28, 28, 1]:
75 |         '''
76 |         # batch_size = inputs.shape[0]
77 |
78 |         x = self.conv_1(inputs)
79 |         x = self.conv_2(x)
80 |         x = self.pool_3(x)
81 |         x = self.drop_4(x)
82 |         x = tf.keras.layers.Flatten()(x)
83 |         x = self.dense_5(x)
84 |         x = self.drop_6(x)
85 |         x = self.dense_7(x)
86 |
87 |         return x
88 |
89 |
90 | def compute_loss(y_true, y_pred):
91 |     # if labels are integers, use sparse categorical crossentropy
92 |     # network's final layer is softmax, so from_logits=False
93 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
94 |     # if labels are one-hot encoded, use standard crossentropy
95 |
96 |     return scce(y_true, y_pred)  # .numpy()
97 |
98 |
99 | def get_dataset():
100 |
101 |     # Read in the mnist data so we have it loaded globally:
102 |     (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
103 |     x_train = x_train.astype(numpy.float32)
104 |     x_test = x_test.astype(numpy.float32)
105 |
106 |     x_train /= 255.
107 |     x_test /= 255.
108 |
109 |     y_train = y_train.astype(numpy.int32)
110 |     y_test = y_test.astype(numpy.int32)
111 |
112 |     return x_train, x_test, y_train, y_test
113 |
114 |
115 | def fetch_batch(_batch_size):
116 |     x_train, x_test, y_train, y_test = get_dataset()
117 |     # note: this re-loads the entire MNIST dataset on every call, an I/O cost the profiler will expose
118 |     indexes = numpy.random.choice(a=x_train.shape[0], size=[_batch_size,])
119 |
120 |     images = x_train[indexes].reshape(_batch_size, 28, 28, 1)
121 |     labels = y_train[indexes].reshape(_batch_size, 1)
122 |
123 |     return images, labels
124 |
125 |
126 | # Here is a function that will manage the training loop for us:
127 |
128 | def train_loop(batch_size, n_training_epochs, model, opt, global_size):
129 |
130 |     logger = logging.getLogger()
131 |
132 |     # note: hvd.rank() is not called here, so the loop also works when Horovod was not initialized
133 |     for i_epoch in range(n_training_epochs):
134 |
135 |         epoch_steps = int(60000/batch_size)
136 |
137 |         for i_batch in range(epoch_steps):
138 |
139 |             start = time.time()
140 |
141 |             with tf.GradientTape() as tape:
142 |                 batch_data, y_true = fetch_batch(batch_size)
143 |                 y_pred = model(batch_data)
144 |                 loss = compute_loss(y_true, y_pred)
145 |
146 |             if global_size != 1:
147 |                 tape = hvd.DistributedGradientTape(tape)
148 |
149 |             trainable_vars = model.trainable_variables
150 |
151 |             # Apply the update to the network (one at a time):
152 |             grads = tape.gradient(loss, trainable_vars)
153 |
154 |             opt.apply_gradients(zip(grads, trainable_vars))
155 |
156 |             end = time.time()
157 |
158 |             images = batch_size*global_size
159 |
160 |             logger.info(f"({i_epoch}, {i_batch}), Loss: {loss:.3f}, step_time: {end-start:.3f}, throughput: {images/(end-start):.3f} img/s.")
161 |
162 |
163 | def train_network(_batch_size, _training_iterations, _lr, global_size):
164 |
165 |     mnist_model = MNISTClassifier()
166 |
167 |     opt = tf.keras.optimizers.Adam(_lr)
168 |
169 |     if global_size != 1:
170 |         hvd.broadcast_variables(mnist_model.variables, root_rank=0)
171 |         hvd.broadcast_variables(opt.variables(), root_rank=0)
172 |
173 |     train_loop(_batch_size, _training_iterations, mnist_model, opt, global_size)
174 |
175 |
176 | if __name__ == '__main__':
177 |
178 |     rank, size = init_mpi()
179 |     configure_logger(rank)
180 |
181 |     parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
182 |     parser.add_argument('--batch_size', type=int, default=64, metavar='N',
183 |                         help='input batch size for training (default: 64)')
184 |     parser.add_argument('--epochs', type=int, default=10, metavar='N',
185 |                         help='number of epochs to train (default: 10)')
186 |     parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
187 |                         help='learning rate (default: 0.01)')
188 |     # parser.add_argument('--device', default='cpu',
189 |     #                     help='Whether this is running on cpu or gpu')
190 |     # parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
191 |     # parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
192 |     # parser.add_argument('--warmup_epochs', default=3, help='number of warmup epochs',
193 |     #                     type=int)
194 |
195 |     args = parser.parse_args()
196 |     scaled_lr = args.lr * size  # use the size returned by init_mpi so serial runs (no Horovod) also work
197 |     train_network(args.batch_size, args.epochs, scaled_lr, size)
--------------------------------------------------------------------------------
/04_distributedLearning/ATPESC_2024_AIMLTrack_DDL_Zheng.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/ATPESC_2024_AIMLTrack_DDL_Zheng.pdf
--------------------------------------------------------------------------------
/04_distributedLearning/DeepSpeed/ds_config.json:
--------------------------------------------------------------------------------
1 | {
2 |   "train_batch_size": 512,
3 |   "steps_per_print": 2000,
4 |   "optimizer": {
5 |     "type": "Adam",
6 |     "params": {
7 |       "lr": 0.01,
8 |       "betas": [
9 |         0.8,
10 |         0.999
11 |       ],
12 |       "eps": 1e-8,
13 |       "weight_decay": 3e-7
14 |     }
15 |   },
16 |   "scheduler": {
17 |     "type": "WarmupLR",
18 |     "params": {
19 |       "warmup_min_lr": 0,
20 |       "warmup_max_lr": 0.001,
21 |       "warmup_num_steps": 1000
22 |     }
23 |   },
24 |   "gradient_clipping": 1.0,
25 |   "prescale_gradients": false,
26 |   "fp16": {
27 |     "enabled": true,
28 |     "fp16_master_weights_and_grads": false,
29 |     "loss_scale": 0,
30 |     "loss_scale_window": 500,
31 |     "hysteresis": 2,
32 |     "min_loss_scale": 1,
33 |     "initial_scale_power": 15
34 |   },
35 |   "wall_clock_breakdown": false,
36 |   "zero_optimization": {
37 |     "stage": 0,
38 |     "allgather_partitions": true,
39 |     "reduce_scatter": true,
40 |     "allgather_bucket_size": 50000000,
41 |     "reduce_bucket_size": 50000000,
42 |     "overlap_comm": true,
43 |     "contiguous_gradients": true,
44 |     "cpu_offload": false
45 |   }
46 | }
47 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_concise.py:
--------------------------------------------------------------------------------
1 | # Turn off TF logs
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
4 |
5 | import tensorflow as tf
6 |
7 | import numpy
8 | import time
9 |
10 |
11 | import argparse
12 | parser = argparse.ArgumentParser(description='Horovod',
13 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
14 | parser.add_argument('--device', default='gpu',
15 |                     help='Whether this is running on cpu or gpu')
16 | args = parser.parse_args()
17 |
18 |
19 | gpus = tf.config.experimental.list_physical_devices('GPU')
20 | for gpu in gpus:
21 |     tf.config.experimental.set_memory_growth(gpu, True)
22 |
23 |
24 | # MNIST dataset
25 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
26 |
27 | x_train = x_train.astype(numpy.float32)
28 | x_test = x_test.astype(numpy.float32)
29 |
30 | x_train /= 255.
31 | x_test /= 255.
32 |
33 | y_train = y_train.astype(numpy.int32)
34 | y_test = y_test.astype(numpy.int32)
35 |
36 |
37 | # Convolutional model
38 |
39 | class MNISTClassifier(tf.keras.models.Model):
40 |
41 |     def __init__(self, activation=tf.nn.tanh):
42 |         tf.keras.models.Model.__init__(self)
43 |
44 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
45 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
46 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
47 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
48 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
49 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
50 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
51 |
52 |     def call(self, inputs):
53 |
54 |         x = self.conv_1(inputs)
55 |         x = self.conv_2(x)
56 |         x = self.pool_3(x)
57 |         x = self.drop_4(x)
58 |         x = tf.keras.layers.Flatten()(x)
59 |         x = self.dense_5(x)
60 |         x = self.drop_6(x)
61 |         x = self.dense_7(x)
62 |
63 |         return x
64 |
65 |
66 |
67 | def train_network_concise(_batch_size, _n_training_epochs, _lr):
68 |
69 |     cnn_model = MNISTClassifier()
70 |
71 |     cnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=tf.keras.optimizers.Adam(_lr), metrics=['accuracy'])  # build the optimizer with the requested learning rate
72 |
73 |     x_train_reshaped = numpy.expand_dims(x_train, -1)
74 |
75 |     history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs)
76 |     return history, cnn_model
77 |
78 | batch_size = 512
79 | epochs = 20
80 | lr = .01
81 | t0 = time.time()
82 | history, cnn_model = train_network_concise(batch_size, epochs, lr)
83 | t1 = time.time()
84 | print("Total time is %s sec" %(t1-t0))
85 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_concise_hvd.py:
--------------------------------------------------------------------------------
1 | import os
2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
3 | os.environ['MPICH_GPU_SUPPORT_ENABLED']='0'
4 | import tensorflow as tf
5 |
6 | #HVD: (1) Initializing Horovod
7 | import horovod.tensorflow.keras as hvd
8 | hvd.init()
9 | print("I am rank %s of %s" %(hvd.rank(), hvd.size()))
10 |
11 | import numpy
12 | import time
13 | import argparse
14 | parser = argparse.ArgumentParser(description='Horovod',
15 |                                  formatter_class=argparse.ArgumentDefaultsHelpFormatter)
16 | parser.add_argument('--device', default='gpu',
17 |                     help='Whether this is running on cpu or gpu')
18 | parser.add_argument('--epochs', default=50, type=int, help='Number of epochs to run')
19 | parser.add_argument('--warmup_epochs', default=3, type=int, help='Number of warmup epochs')
20 | parser.add_argument('--learning_rate', '--lr', default=0.01, type=float)
21 | parser.add_argument('--batch_size', default=512, type=int)
22 | args = parser.parse_args()
23 |
24 | from tensorflow.python.client import device_lib
25 |
26 | def get_available_devices():
27 |     local_device_protos = device_lib.list_local_devices()
28 |     return [x.name for x in local_device_protos if x.device_type == 'GPU' or x.device_type == 'CPU']
29 |
30 | gpus = tf.config.experimental.list_physical_devices('GPU')
31 | for gpu in gpus:
32 |     tf.config.experimental.set_memory_growth(gpu, True)
33 | #HVD: (2) Pin one GPU to a specific Horovod worker
34 | if gpus:
35 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
36 |
37 | tf.config.threading.set_intra_op_parallelism_threads(0)
38 | tf.config.threading.set_inter_op_parallelism_threads(2)
39 |
40 |
41 |
42 | # MNIST dataset
43 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
44 |
45 | x_train = x_train.astype(numpy.float32)
46 | x_test = x_test.astype(numpy.float32)
47 |
48 | x_train /= 255.
49 | x_test /= 255.
50 |
51 | y_train = y_train.astype(numpy.int32)
52 | y_test = y_test.astype(numpy.int32)
53 |
54 |
55 |
56 |
57 |
58 | # Convolutional model
59 |
60 | class MNISTClassifier(tf.keras.models.Model):
61 |
62 |     def __init__(self, activation=tf.nn.tanh):
63 |         tf.keras.models.Model.__init__(self)
64 |
65 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
66 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
67 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
68 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
69 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
70 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
71 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
72 |
73 |     def call(self, inputs):
74 |
75 |         x = self.conv_1(inputs)
76 |         x = self.conv_2(x)
77 |         x = self.pool_3(x)
78 |         x = self.drop_4(x)
79 |         x = tf.keras.layers.Flatten()(x)
80 |         x = self.dense_5(x)
81 |         x = self.drop_6(x)
82 |         x = self.dense_7(x)
83 |
84 |         return x
85 |
86 |
87 |
88 | def train_network_concise(_batch_size, _n_training_epochs, _lr):
89 |
90 |     cnn_model = MNISTClassifier()
91 |     #HVD: (3) scale the learning rate
92 |     opt = tf.optimizers.Adam(_lr*hvd.size())
93 |     #HVD: (4) add Horovod Distributed Optimizer
94 |     opt = hvd.DistributedOptimizer(opt)
95 |     # Specify `experimental_run_tf_function=False`
96 |     cnn_model.compile(loss="sparse_categorical_crossentropy", optimizer=opt, metrics=['accuracy'],
97 |                       experimental_run_tf_function=False)
98 |     #HVD: (5) Define callbacks
99 |     callbacks = [
100 |         # broadcast initial variable states from rank 0
101 |         hvd.callbacks.BroadcastGlobalVariablesCallback(0),
102 |         # Average metric at the end of every epoch
103 |         hvd.callbacks.MetricAverageCallback(),
104 |         # Warmup
105 |         hvd.callbacks.LearningRateWarmupCallback(warmup_epochs=args.warmup_epochs, verbose=1, initial_lr=_lr*hvd.size()),
106 |     ]
107 |     #HVD: (6) save checkpoints only on worker 0
108 |     if hvd.rank()==0:
109 |         callbacks.append(tf.keras.callbacks.ModelCheckpoint('./checkpoint-{epoch}.h5'))
110 |     verbose=0
111 |     if hvd.rank()==0:
112 |         verbose=1
113 |     x_train_reshaped = numpy.expand_dims(x_train, -1)
114 |     #HVD: (7) Adjust the number of steps per epoch
115 |     if (args.device=='cpu'):
116 |         with tf.device('/device:CPU:0'):
117 |             history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs, callbacks=callbacks, steps_per_epoch=60000//hvd.size()//_batch_size, verbose=verbose)
118 |     else:
119 |         history = cnn_model.fit(x_train_reshaped, y_train, batch_size=_batch_size, epochs=_n_training_epochs, callbacks=callbacks, steps_per_epoch=60000//hvd.size()//_batch_size, verbose=verbose)
120 |     return history, cnn_model
121 |
122 | batch_size = args.batch_size
123 | epochs = args.epochs
124 | lr = args.learning_rate
125 | history, cnn_model = train_network_concise(batch_size, 1, lr)  # 1-epoch warm-up run so one-time initialization cost is excluded from the timing below
126 |
127 | t0 = time.time()
128 | history, cnn_model = train_network_concise(batch_size, epochs-1, lr)
129 | t1 = time.time()
130 | if (hvd.rank()==0):
131 |     print("Hvd Procs %d Total time: %s seconds" %(hvd.size(),t1-t0))
132 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_verbose.py:
--------------------------------------------------------------------------------
1 | # Baseline example without Horovod; compare with 04_keras_cnn_verbose_hvd.py
2 |
3 | import tensorflow as tf
4 |
5 | import numpy
6 | import time
7 |
8 | import argparse
9 | parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
10 | parser.add_argument('--epochs', type=int, default=50, metavar='N',
11 |                     help='number of epochs to train (default: 50)')
12 | parser.add_argument('--device', default='gpu',
13 |                     help='Whether this is running on cpu or gpu')
14 | parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
15 | parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
16 |
17 | args = parser.parse_args()
18 | tf.config.threading.set_intra_op_parallelism_threads(args.num_intra)
19 | tf.config.threading.set_inter_op_parallelism_threads(args.num_inter)
20 | gpus = tf.config.experimental.list_physical_devices('GPU')
21 | for gpu in gpus:
22 |     tf.config.experimental.set_memory_growth(gpu, True)
23 |
24 |
25 | # MNIST dataset
26 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
27 |
28 | x_train = x_train.astype(numpy.float32)
29 | x_test = x_test.astype(numpy.float32)
30 |
31 | x_train /= 255.
32 | x_test /= 255.
33 |
34 | y_train = y_train.astype(numpy.int32)
35 | y_test = y_test.astype(numpy.int32)
36 |
37 |
38 | # Convolutional model
39 |
40 | class MNISTClassifier(tf.keras.models.Model):
41 |
42 |     def __init__(self, activation=tf.nn.tanh):
43 |         tf.keras.models.Model.__init__(self)
44 |
45 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
46 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
47 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
48 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
49 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
50 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
51 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
52 |
53 |     def call(self, inputs):
54 |
55 |         x = self.conv_1(inputs)
56 |         x = self.conv_2(x)
57 |         x = self.pool_3(x)
58 |         x = self.drop_4(x)
59 |         x = tf.keras.layers.Flatten()(x)
60 |         x = self.dense_5(x)
61 |         x = self.drop_6(x)
62 |         x = self.dense_7(x)
63 |
64 |         return x
65 |
66 | def compute_loss(y_true, y_pred):
67 |     # if labels are integers, use sparse categorical crossentropy
68 |     # network's final layer is softmax, so from_logits=False
69 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
70 |     # if labels are one-hot encoded, use standard crossentropy
71 |
72 |     return scce(y_true, y_pred)
73 |
74 |
75 | def forward_pass(model, batch_data, y_true):
76 |     y_pred = model(batch_data)
77 |     loss = compute_loss(y_true, y_pred)
78 |     return loss
79 |
80 |
81 | def train_loop(batch_size, n_training_epochs, model, opt):
82 |
83 |     @tf.function()
84 |     def train_iteration(data, y_true, model, opt):
85 |         with tf.GradientTape() as tape:
86 |             loss = forward_pass(model, data, y_true)
87 |
88 |         trainable_vars = model.trainable_variables
89 |
90 |         # Apply the update to the network (one at a time):
91 |         grads = tape.gradient(loss, trainable_vars)
92 |
93 |         opt.apply_gradients(zip(grads, trainable_vars))
94 |         return loss
95 |
96 |     for i_epoch in range(n_training_epochs):
97 |         print("beginning epoch %d" % i_epoch)
98 |         start = time.time()
99 |
100 |         epoch_steps = int(60000/batch_size)
101 |         # shuffle returns a new dataset (it is not in-place), so chain it into batch():
102 |         batches = dataset.shuffle(60000).batch(batch_size=batch_size, drop_remainder=True)
103 |
104 |         for i_batch, (batch_data, y_true) in enumerate(batches):
105 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
106 |             if (args.device=='cpu'):
107 |                 with tf.device("/cpu:0"):
108 |                     loss = train_iteration(batch_data, y_true, model, opt)
109 |             else:
110 |                 loss = train_iteration(batch_data, y_true, model, opt)
111 |
112 |         end = time.time()
113 |         print("took %1.1f seconds for epoch #%d" % (end-start, i_epoch))
114 |
115 |
116 | def train_network(_batch_size, _n_training_epochs, _lr):
117 |
118 |     mnist_model = MNISTClassifier()
119 |
120 |     opt = tf.keras.optimizers.Adam(_lr)
121 |
122 |     train_loop(_batch_size, _n_training_epochs, mnist_model, opt)
123 |
124 |
125 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
126 | dataset = dataset.shuffle(60000)  # keep the result; shuffle is not in-place
127 |
128 | batch_size = 512
129 | epochs = args.epochs
130 | lr = .01
131 | train_network(batch_size, epochs, lr)
132 |
--------------------------------------------------------------------------------
/04_distributedLearning/Horovod/04_keras_cnn_verbose_hvd.py:
--------------------------------------------------------------------------------
1 | # Horovod example
2 | import os
3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
4 | os.environ['MPICH_GPU_SUPPORT_ENABLED']='0'
5 |
6 | import tensorflow as tf
7 |
8 | import numpy
9 | import time
10 |
11 | #HVD: (1) Import horovod
12 | import horovod.tensorflow as hvd
13 | hvd.init()
14 | print("I am rank %d of %d"%(hvd.rank(), hvd.size()))
15 |
16 | import argparse
17 | parser = argparse.ArgumentParser(description='TensorFlow MNIST Example')
18 | parser.add_argument('--epochs', default=50,
19 |                     type=int, help='Number of epochs to run')
20 |
21 | parser.add_argument('--device', default='gpu',
22 |                     help='Whether this is running on cpu or gpu')
23 | parser.add_argument('--num_inter', default=2, help='set number inter', type=int)
24 | parser.add_argument('--num_intra', default=0, help='set number intra', type=int)
25 | parser.add_argument('--batch_size', default=512, type=int)
26 | args = parser.parse_args()
27 | tf.config.threading.set_intra_op_parallelism_threads(args.num_intra)
28 | tf.config.threading.set_inter_op_parallelism_threads(args.num_inter)
29 | gpus = tf.config.experimental.list_physical_devices('GPU')
30 | for gpu in gpus:
31 |     tf.config.experimental.set_memory_growth(gpu, True)
32 | #HVD: (2) Pin GPU
33 | if gpus:
34 |     tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
35 |
36 | # MNIST dataset
37 | (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
38 |
39 | x_train = x_train.astype(numpy.float32)
40 | x_test = x_test.astype(numpy.float32)
41 |
42 | x_train /= 255.
43 | x_test /= 255.
44 |
45 | y_train = y_train.astype(numpy.int32)
46 | y_test = y_test.astype(numpy.int32)
47 |
48 | dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
49 | #HVD: shard the dataset across workers
50 | dataset = dataset.shard(hvd.size(), hvd.rank())
51 | dataset = dataset.shuffle(60000)
52 | batches = dataset.batch(batch_size = args.batch_size, drop_remainder=True)
53 |
54 | # Convolutional model
55 |
56 | class MNISTClassifier(tf.keras.models.Model):
57 |
58 |     def __init__(self, activation=tf.nn.tanh):
59 |         tf.keras.models.Model.__init__(self)
60 |
61 |         self.conv_1 = tf.keras.layers.Conv2D(32, [3, 3], activation='relu')
62 |         self.conv_2 = tf.keras.layers.Conv2D(64, [3, 3], activation='relu')
63 |         self.pool_3 = tf.keras.layers.MaxPooling2D(pool_size=(2, 2))
64 |         self.drop_4 = tf.keras.layers.Dropout(0.25)
65 |         self.dense_5 = tf.keras.layers.Dense(128, activation='relu')
66 |         self.drop_6 = tf.keras.layers.Dropout(0.5)
67 |         self.dense_7 = tf.keras.layers.Dense(10, activation='softmax')
68 |
69 |     def call(self, inputs):
70 |
71 |         x = self.conv_1(inputs)
72 |         x = self.conv_2(x)
73 |         x = self.pool_3(x)
74 |         x = self.drop_4(x)
75 |         x = tf.keras.layers.Flatten()(x)
76 |         x = self.dense_5(x)
77 |         x = self.drop_6(x)
78 |         x = self.dense_7(x)
79 |
80 |         return x
81 |
82 | def compute_loss(y_true, y_pred):
83 |     # if labels are integers, use sparse categorical crossentropy
84 |     # network's final layer is softmax, so from_logits=False
85 |     scce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
86 |     # if labels are one-hot encoded, use standard crossentropy
87 |
88 |     return scce(y_true, y_pred)
89 |
90 |
91 | def forward_pass(model, batch_data, y_true):
92 |     y_pred = model(batch_data)
93 |     loss = compute_loss(y_true, y_pred)
94 |     return loss
95 |
96 |
97 | def train_loop(batch_size, n_training_epochs, model, opt):
98 |
99 |     @tf.function()
100 |     def train_iteration(data, y_true, model, opt):
101 |         with tf.GradientTape() as tape:
102 |             loss = forward_pass(model, data, y_true)
103 |
104 |         trainable_vars = model.trainable_variables
105 |         #HVD: (4) distributed tape
106 |         tape = hvd.DistributedGradientTape(tape)
107 |
108 |         # Apply the update to the network (one at a time):
109 |         grads = tape.gradient(loss, trainable_vars)
110 |
111 |         opt.apply_gradients(zip(grads, trainable_vars))
112 |         return loss
113 |
114 |     for i_epoch in range(n_training_epochs):
115 |         if (hvd.rank()==0):
116 |             print("beginning epoch %d" % i_epoch)
117 |         start = time.time()
118 |         total_loss = 0.0
119 |         for i_batch, (batch_data, y_true) in enumerate(batches):
120 |             batch_data = tf.reshape(batch_data, [-1, 28, 28, 1])
121 |             if (args.device=='cpu'):
122 |                 with tf.device("/cpu:0"):
123 |                     loss = train_iteration(batch_data, y_true, model, opt)
124 |             else:
125 |                 loss = train_iteration(batch_data, y_true, model, opt)
126 |             total_loss += loss
127 |             #HVD: (5) broadcast from rank 0 (must happen after the first step to ensure the optimizer is initialized)
128 |             if (i_batch==0 and i_epoch==0):
129 |                 hvd.broadcast_variables(model.variables, root_rank=0)
130 |                 hvd.broadcast_variables(opt.variables(), root_rank=0)
131 |         #HVD: (6) sum the loss across workers (average=False sums rather than averages)
132 |         total_loss = hvd.allreduce(total_loss, average=False)
133 |         end = time.time()
134 |         if (hvd.rank()==0):
135 |             print("took %4.4f seconds for epoch #%d - loss: %1.4f" % (end-start, i_epoch, total_loss))
136 |
137 |
138 | def train_network(_batch_size, _n_training_epochs, _lr):
139 |
140 |     mnist_model = MNISTClassifier()
141 |     #HVD: (3) scale learning rate
142 |     opt =
tf.keras.optimizers.Adam(_lr*hvd.size()) 143 | 144 | train_loop(_batch_size, _n_training_epochs, mnist_model, opt) 145 | 146 | 147 | batch_size = 512 148 | epochs = args.epochs 149 | lr = .01 150 | train_network(batch_size, epochs, lr) 151 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/#mpi_profile.2500161.0#: -------------------------------------------------------------------------------- 1 | 2 | Data for MPI rank 0 of 8: 3 | Times and statistics from MPI_Init() to MPI_Finalize(). 4 | ----------------------------------------------------------------------- 5 | MPI Routine #calls avg. bytes time(sec) 6 | ----------------------------------------------------------------------- 7 | MPI_Comm_rank 5 0.0 0.000 8 | MPI_Comm_size 3 0.0 0.000 9 | MPI_Bcast 150 192058.6 0.021 10 | MPI_Allreduce 77340 44324.1 106.664 11 | MPI_Gather 50 4.0 0.003 12 | MPI_Gatherv 50 0.0 0.001 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | total communication time = 106.689 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2652.547 seconds. 18 | system time = 332.597 seconds. 19 | max resident set size = 1901.574 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.000 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.017 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 86.606 45 | 16 40.0 0.003 46 | 709 128.0 0.093 47 | 709 256.0 0.149 48 | 10 512.0 0.002 49 | 76 552.0 0.075 50 | 713 1154.2 0.069 51 | 684 5607.9 0.645 52 | 714 73730.2 0.176 53 | 714 4718824.9 18.844 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.003 57 | 58 | MPI_Allgather #calls avg. bytes time(sec) 59 | 2 4.0 0.000 60 | 61 | ----------------------------------------------------------------- 62 | 63 | Communication summary for all tasks: 64 | 65 | minimum communication time = 105.854 sec for task 4 66 | median communication time = 107.650 sec for task 1 67 | maximum communication time = 109.122 sec for task 5 68 | 69 | 70 | MPI timing summary for all ranks: 71 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 72 | 0 thetagpu23 106.69 128.90 2652.55 332.60 1901.57 1791292 73 | 1 thetagpu23 107.65 128.90 2633.87 266.90 1879.50 1793841 74 | 2 thetagpu23 108.69 128.90 2683.13 236.28 1928.97 1795163 75 | 3 thetagpu23 109.09 128.90 2645.00 239.66 1892.77 1805047 76 | 4 thetagpu23 105.85 128.90 2616.02 262.91 1864.30 1785421 77 | 5 thetagpu23 109.12 128.90 2625.73 271.42 1870.32 1796652 78 | 6 thetagpu23 107.05 128.90 2619.91 294.70 1865.26 1810114 79 | 7 thetagpu23 107.69 128.90 2652.56 236.55 1961.68 1795444 80 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.0: -------------------------------------------------------------------------------- 1 | Data for MPI rank 0 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 
3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.021 9 | MPI_Allreduce 77340 44324.1 106.664 10 | MPI_Gather 50 4.0 0.003 11 | MPI_Gatherv 50 0.0 0.001 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | total communication time = 106.689 seconds. 15 | total elapsed time = 128.903 seconds. 16 | user cpu time = 2652.547 seconds. 17 | system time = 332.597 seconds. 18 | max resident set size = 1901.574 MBytes. 19 | 20 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 21 | Rank 0 reported the largest elapsed time : 128.90 sec 22 | 23 | ----------------------------------------------------------------- 24 | Message size distributions: 25 | 26 | MPI_Bcast #calls avg. bytes time(sec) 27 | 50 4.0 0.000 28 | 2 8.0 0.000 29 | 39 25.0 0.000 30 | 6 40.0 0.000 31 | 6 128.0 0.000 32 | 7 245.9 0.000 33 | 11 406.3 0.000 34 | 1 553.0 0.000 35 | 8 1193.2 0.000 36 | 2 2829.0 0.000 37 | 6 5120.0 0.000 38 | 6 73728.0 0.003 39 | 6 4718592.0 0.017 40 | 41 | MPI_Allreduce #calls avg. bytes time(sec) 42 | 102 4.0 0.001 43 | 72893 16.0 86.606 44 | 16 40.0 0.003 45 | 709 128.0 0.093 46 | 709 256.0 0.149 47 | 10 512.0 0.002 48 | 76 552.0 0.075 49 | 713 1154.2 0.069 50 | 684 5607.9 0.645 51 | 714 73730.2 0.176 52 | 714 4718824.9 18.844 53 | 54 | MPI_Gather #calls avg. bytes time(sec) 55 | 50 4.0 0.003 56 | 57 | MPI_Allgather #calls avg. bytes time(sec) 58 | 2 4.0 0.000 59 | 60 | ----------------------------------------------------------------- 61 | 62 | Communication summary for all tasks: 63 | 64 | minimum communication time = 105.854 sec for task 4 65 | median communication time = 107.650 sec for task 1 66 | maximum communication time = 109.122 sec for task 5 67 | 68 | 69 | MPI timing summary for all ranks: 70 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 71 | 0 thetagpu23 106.69 128.90 2652.55 332.60 1901.57 1791292 72 | 1 thetagpu23 107.65 128.90 2633.87 266.90 1879.50 1793841 73 | 2 thetagpu23 108.69 128.90 2683.13 236.28 1928.97 1795163 74 | 3 thetagpu23 109.09 128.90 2645.00 239.66 1892.77 1805047 75 | 4 thetagpu23 105.85 128.90 2616.02 262.91 1864.30 1785421 76 | 5 thetagpu23 109.12 128.90 2625.73 271.42 1870.32 1796652 77 | 6 thetagpu23 107.05 128.90 2619.91 294.70 1865.26 1810114 78 | 7 thetagpu23 107.69 128.90 2652.56 236.55 1961.68 1795444 79 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.1: -------------------------------------------------------------------------------- 1 | Data for MPI rank 1 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.027 9 | MPI_Allreduce 77340 44324.1 107.623 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 187.9 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 1 of 8 had the median communication time. 15 | total communication time = 107.650 seconds. 
16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2633.868 seconds. 18 | system time = 266.897 seconds. 19 | max resident set size = 1879.500 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.018 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 87.583 45 | 16 40.0 0.003 46 | 709 128.0 0.099 47 | 709 256.0 0.181 48 | 10 512.0 0.004 49 | 76 552.0 0.063 50 | 713 1154.2 0.072 51 | 684 5607.9 0.591 52 | 714 73730.2 0.178 53 | 714 4718824.9 18.849 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 39 25.0 0.000 60 | 1 121.0 0.000 61 | 4 239.0 0.000 62 | 1 257.0 0.000 63 | 1 601.0 0.000 64 | 2 1033.0 0.000 65 | 2 2209.0 0.000 66 | 67 | MPI_Allgather #calls avg. bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.4: -------------------------------------------------------------------------------- 1 | Data for MPI rank 4 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.026 9 | MPI_Allreduce 77340 44324.1 105.827 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 188.2 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 4 of 8 had the minimum communication time. 15 | total communication time = 105.854 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2616.017 seconds. 18 | system time = 262.911 seconds. 19 | max resident set size = 1864.305 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.002 40 | 6 4718592.0 0.019 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 85.829 45 | 16 40.0 0.003 46 | 709 128.0 0.099 47 | 709 256.0 0.092 48 | 10 512.0 0.004 49 | 76 552.0 0.075 50 | 713 1154.2 0.069 51 | 684 5607.9 0.610 52 | 714 73730.2 0.178 53 | 714 4718824.9 18.866 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 39 25.0 0.000 60 | 1 121.0 0.000 61 | 4 239.0 0.000 62 | 1 257.0 0.000 63 | 1 601.0 0.000 64 | 2 1041.0 0.000 65 | 2 2209.0 0.000 66 | 67 | MPI_Allgather #calls avg. 
bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/cpu/mpi_profile.2500161.5: -------------------------------------------------------------------------------- 1 | Data for MPI rank 5 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 150 192058.6 0.027 9 | MPI_Allreduce 77340 44324.1 109.094 10 | MPI_Gather 50 4.0 0.000 11 | MPI_Gatherv 50 187.4 0.000 12 | MPI_Allgather 2 4.0 0.000 13 | ----------------------------------------------------------------- 14 | MPI task 5 of 8 had the maximum communication time. 15 | total communication time = 109.122 seconds. 16 | total elapsed time = 128.903 seconds. 17 | user cpu time = 2625.732 seconds. 18 | system time = 271.419 seconds. 19 | max resident set size = 1870.316 MBytes. 20 | 21 | Rank 7 reported the largest memory utilization : 1961.68 MBytes 22 | Rank 0 reported the largest elapsed time : 128.90 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 50 4.0 0.005 29 | 2 8.0 0.000 30 | 39 25.0 0.000 31 | 6 40.0 0.000 32 | 6 128.0 0.000 33 | 7 245.9 0.000 34 | 11 406.3 0.000 35 | 1 553.0 0.000 36 | 8 1193.2 0.000 37 | 2 2829.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.002 40 | 6 4718592.0 0.019 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 72893 16.0 89.051 45 | 16 40.0 0.003 46 | 709 128.0 0.096 47 | 709 256.0 0.182 48 | 10 512.0 0.004 49 | 76 552.0 0.056 50 | 713 1154.2 0.068 51 | 684 5607.9 0.627 52 | 714 73730.2 0.176 53 | 714 4718824.9 18.831 54 | 55 | MPI_Gather #calls avg. bytes time(sec) 56 | 50 4.0 0.000 57 | 58 | MPI_Gatherv #calls avg. bytes time(sec) 59 | 40 25.0 0.000 60 | 1 121.0 0.000 61 | 2 237.0 0.000 62 | 2 345.0 0.000 63 | 2 809.0 0.000 64 | 1 1041.0 0.000 65 | 2 2213.0 0.000 66 | 67 | MPI_Allgather #calls avg. bytes time(sec) 68 | 2 4.0 0.000 69 | 70 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.0: -------------------------------------------------------------------------------- 1 | Data for MPI rank 0 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.019 9 | MPI_Barrier 1 0.0 0.081 10 | MPI_Allreduce 12506 15.9 5.887 11 | MPI_Gather 30 4.0 0.006 12 | MPI_Gatherv 30 0.0 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | total communication time = 5.994 seconds. 16 | total elapsed time = 18.068 seconds. 17 | user cpu time = 19.887 seconds. 18 | system time = 16.860 seconds. 19 | max resident set size = 4690.930 MBytes. 
20 | 21 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 22 | Rank 5 reported the largest elapsed time : 18.19 sec 23 | 24 | ----------------------------------------------------------------- 25 | Message size distributions: 26 | 27 | MPI_Bcast #calls avg. bytes time(sec) 28 | 30 4.0 0.000 29 | 2 8.0 0.000 30 | 20 25.0 0.000 31 | 6 40.0 0.000 32 | 7 128.0 0.000 33 | 8 241.2 0.000 34 | 8 476.2 0.000 35 | 3 793.0 0.000 36 | 7 1149.9 0.000 37 | 2 2665.0 0.000 38 | 6 5120.0 0.000 39 | 6 73728.0 0.003 40 | 6 4718592.0 0.015 41 | 42 | MPI_Allreduce #calls avg. bytes time(sec) 43 | 102 4.0 0.001 44 | 12404 16.0 5.885 45 | 46 | MPI_Gather #calls avg. bytes time(sec) 47 | 30 4.0 0.006 48 | 49 | MPI_Allgather #calls avg. bytes time(sec) 50 | 2 4.0 0.000 51 | 52 | ----------------------------------------------------------------- 53 | 54 | Communication summary for all tasks: 55 | 56 | minimum communication time = 3.775 sec for task 5 57 | median communication time = 5.787 sec for task 6 58 | maximum communication time = 6.732 sec for task 2 59 | 60 | 61 | MPI timing summary for all ranks: 62 | taskid hostname comm(s) elapsed(s) user(s) system(s) size(MB) switches 63 | 0 thetagpu23 5.99 18.07 19.89 16.86 4690.93 102621 64 | 1 thetagpu23 6.57 18.19 16.87 2.96 4687.09 102385 65 | 2 thetagpu23 6.73 18.19 16.84 3.12 4738.88 100802 66 | 3 thetagpu23 6.59 18.19 16.60 3.16 4704.07 101685 67 | 4 thetagpu23 4.68 18.19 14.43 3.34 4692.63 102432 68 | 5 thetagpu23 3.77 18.19 13.95 3.24 4679.32 100831 69 | 6 thetagpu23 5.79 18.19 16.15 2.98 4687.98 102823 70 | 7 thetagpu23 4.49 18.07 14.63 3.16 4684.28 102118 71 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.2: -------------------------------------------------------------------------------- 1 | Data for MPI rank 2 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.126 9 | MPI_Barrier 1 0.0 0.001 10 | MPI_Allreduce 12506 15.9 6.606 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 295.4 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 2 of 8 had the maximum communication time. 16 | total communication time = 6.732 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 16.840 seconds. 19 | system time = 3.117 seconds. 20 | max resident set size = 4738.883 MBytes. 21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.005 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.016 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 6.604 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. 
bytes time(sec) 51 | 20 25.0 0.000 52 | 1 121.0 0.000 53 | 1 161.0 0.000 54 | 2 357.0 0.000 55 | 3 723.7 0.000 56 | 2 1509.0 0.000 57 | 1 2177.0 0.000 58 | 59 | MPI_Allgather #calls avg. bytes time(sec) 60 | 2 4.0 0.000 61 | 62 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.5: -------------------------------------------------------------------------------- 1 | Data for MPI rank 5 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.128 9 | MPI_Barrier 1 0.0 0.000 10 | MPI_Allreduce 12506 15.9 3.645 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 295.7 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 5 of 8 had the minimum communication time. 16 | total communication time = 3.775 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 13.953 seconds. 19 | system time = 3.244 seconds. 20 | max resident set size = 4679.320 MBytes. 21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.007 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.017 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 3.644 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. bytes time(sec) 51 | 20 25.0 0.000 52 | 1 121.0 0.000 53 | 3 339.7 0.000 54 | 3 798.3 0.000 55 | 3 1611.7 0.000 56 | 57 | MPI_Allgather #calls avg. bytes time(sec) 58 | 2 4.0 0.000 59 | 60 | -------------------------------------------------------------------------------- /04_distributedLearning/Horovod/mpitrace/gpu/mpi_profile.2497205.6: -------------------------------------------------------------------------------- 1 | Data for MPI rank 6 of 8: 2 | Times and statistics from MPI_Init() to MPI_Finalize(). 3 | ----------------------------------------------------------------------- 4 | MPI Routine #calls avg. bytes time(sec) 5 | ----------------------------------------------------------------------- 6 | MPI_Comm_rank 5 0.0 0.000 7 | MPI_Comm_size 3 0.0 0.000 8 | MPI_Bcast 111 259530.7 0.127 9 | MPI_Barrier 1 0.0 0.001 10 | MPI_Allreduce 12506 15.9 5.658 11 | MPI_Gather 30 4.0 0.000 12 | MPI_Gatherv 30 293.8 0.000 13 | MPI_Allgather 2 4.0 0.000 14 | ----------------------------------------------------------------- 15 | MPI task 6 of 8 had the median communication time. 16 | total communication time = 5.787 seconds. 17 | total elapsed time = 18.190 seconds. 18 | user cpu time = 16.145 seconds. 19 | system time = 2.975 seconds. 20 | max resident set size = 4687.980 MBytes. 
21 | 22 | Rank 2 reported the largest memory utilization : 4738.88 MBytes 23 | Rank 5 reported the largest elapsed time : 18.19 sec 24 | 25 | ----------------------------------------------------------------- 26 | Message size distributions: 27 | 28 | MPI_Bcast #calls avg. bytes time(sec) 29 | 30 4.0 0.005 30 | 2 8.0 0.000 31 | 20 25.0 0.000 32 | 6 40.0 0.000 33 | 7 128.0 0.101 34 | 8 241.2 0.000 35 | 8 476.2 0.000 36 | 3 793.0 0.000 37 | 7 1149.9 0.000 38 | 2 2665.0 0.000 39 | 6 5120.0 0.000 40 | 6 73728.0 0.003 41 | 6 4718592.0 0.017 42 | 43 | MPI_Allreduce #calls avg. bytes time(sec) 44 | 102 4.0 0.001 45 | 12404 16.0 5.657 46 | 47 | MPI_Gather #calls avg. bytes time(sec) 48 | 30 4.0 0.000 49 | 50 | MPI_Gatherv #calls avg. bytes time(sec) 51 | 21 25.0 0.000 52 | 1 121.0 0.000 53 | 3 369.0 0.000 54 | 2 805.0 0.000 55 | 1 1049.0 0.000 56 | 2 2201.0 0.000 57 | 58 | MPI_Allgather #calls avg. bytes time(sec) 59 | 2 4.0 0.000 60 | 61 | -------------------------------------------------------------------------------- /04_distributedLearning/figures/Horovod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/Horovod.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/cpu_horovodtimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/cpu_horovodtimeline.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/distributed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/distributed.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/gpu_horovodtimeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/gpu_horovodtimeline.png -------------------------------------------------------------------------------- /04_distributedLearning/figures/resnet50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/04_distributedLearning/figures/resnet50.png -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_1.out: -------------------------------------------------------------------------------- 1 | I am rank 0 of 1 2 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 3 | warnings.warn( 4 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0036s vs `on_train_batch_end` time: 0.0357s). Check your callbacks. 
5 | 117/117 - 7s - loss: 0.3079 - accuracy: 0.9042 6 | Epoch 1/20 7 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0036s vs `on_train_batch_end` time: 0.0189s). Check your callbacks. 8 | 117/117 - 1s - loss: 0.2835 - accuracy: 0.9120 9 | Epoch 2/20 10 | 117/117 - 1s - loss: 0.0888 - accuracy: 0.9743 11 | Epoch 3/20 12 | 117/117 - 1s - loss: 0.0683 - accuracy: 0.9793 13 | 14 | Epoch 3: finished gradual learning rate warmup to 0.01. 15 | Epoch 4/20 16 | 117/117 - 1s - loss: 0.0656 - accuracy: 0.9799 17 | Epoch 5/20 18 | 117/117 - 1s - loss: 0.0513 - accuracy: 0.9845 19 | Epoch 6/20 20 | 117/117 - 1s - loss: 0.0520 - accuracy: 0.9839 21 | Epoch 7/20 22 | 117/117 - 1s - loss: 0.0477 - accuracy: 0.9848 23 | Epoch 8/20 24 | 117/117 - 1s - loss: 0.0423 - accuracy: 0.9864 25 | Epoch 9/20 26 | 117/117 - 1s - loss: 0.0409 - accuracy: 0.9870 27 | Epoch 10/20 28 | 117/117 - 1s - loss: 0.0389 - accuracy: 0.9879 29 | Epoch 11/20 30 | 117/117 - 1s - loss: 0.0381 - accuracy: 0.9880 31 | Epoch 12/20 32 | 117/117 - 1s - loss: 0.0406 - accuracy: 0.9874 33 | Epoch 13/20 34 | 117/117 - 1s - loss: 0.0416 - accuracy: 0.9872 35 | Epoch 14/20 36 | 117/117 - 1s - loss: 0.0351 - accuracy: 0.9892 37 | Epoch 15/20 38 | 117/117 - 1s - loss: 0.0356 - accuracy: 0.9887 39 | Epoch 16/20 40 | 117/117 - 1s - loss: 0.0398 - accuracy: 0.9875 41 | Epoch 17/20 42 | 117/117 - 1s - loss: 0.0395 - accuracy: 0.9881 43 | Epoch 18/20 44 | 117/117 - 1s - loss: 0.0371 - accuracy: 0.9884 45 | Epoch 19/20 46 | 117/117 - 1s - loss: 0.0361 - accuracy: 0.9888 47 | Epoch 20/20 48 | 117/117 - 1s - loss: 0.0411 - accuracy: 0.9876 49 | Total time: 13.148040294647217 second 50 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_2.out: -------------------------------------------------------------------------------- 1 | I am rank 1 of 2 2 | I am rank 0 of 2 3 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 4 | warnings.warn( 5 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 6 | warnings.warn( 7 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0071s vs `on_train_batch_end` time: 0.0387s). Check your callbacks. 8 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0069s vs `on_train_batch_end` time: 0.0388s). Check your callbacks. 9 | 58/58 - 8s - loss: 0.3714 - accuracy: 0.8798 10 | Epoch 1/20 11 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0178s). Check your callbacks. 12 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0058s vs `on_train_batch_end` time: 0.0180s). Check your callbacks. 13 | 58/58 - 1s - loss: 0.4186 - accuracy: 0.8662 14 | Epoch 2/20 15 | 58/58 - 0s - loss: 0.0983 - accuracy: 0.9711 16 | Epoch 3/20 17 | 58/58 - 0s - loss: 0.0706 - accuracy: 0.9788 18 | 19 | Epoch 3: finished gradual learning rate warmup to 0.01. 
20 | Epoch 4/20 21 | 58/58 - 0s - loss: 0.0641 - accuracy: 0.9796 22 | Epoch 5/20 23 | 58/58 - 0s - loss: 0.0584 - accuracy: 0.9826 24 | Epoch 6/20 25 | 58/58 - 0s - loss: 0.0485 - accuracy: 0.9840 26 | Epoch 7/20 27 | 58/58 - 0s - loss: 0.0398 - accuracy: 0.9872 28 | Epoch 8/20 29 | 58/58 - 0s - loss: 0.0426 - accuracy: 0.9869 30 | Epoch 9/20 31 | 58/58 - 0s - loss: 0.0432 - accuracy: 0.9863 32 | Epoch 10/20 33 | 58/58 - 0s - loss: 0.0350 - accuracy: 0.9885 34 | Epoch 11/20 35 | 58/58 - 0s - loss: 0.0322 - accuracy: 0.9898 36 | Epoch 12/20 37 | 58/58 - 0s - loss: 0.0269 - accuracy: 0.9908 38 | Epoch 13/20 39 | 58/58 - 0s - loss: 0.0317 - accuracy: 0.9898 40 | Epoch 14/20 41 | 58/58 - 0s - loss: 0.0285 - accuracy: 0.9905 42 | Epoch 15/20 43 | 58/58 - 0s - loss: 0.0276 - accuracy: 0.9908 44 | Epoch 16/20 45 | 58/58 - 0s - loss: 0.0280 - accuracy: 0.9904 46 | Epoch 17/20 47 | 58/58 - 0s - loss: 0.0247 - accuracy: 0.9921 48 | Epoch 18/20 49 | 58/58 - 0s - loss: 0.0276 - accuracy: 0.9911 50 | Epoch 19/20 51 | 58/58 - 0s - loss: 0.0266 - accuracy: 0.9914 52 | Epoch 20/20 53 | 58/58 - 0s - loss: 0.0234 - accuracy: 0.9922 54 | Total time: 8.65635347366333 second 55 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_4.out: -------------------------------------------------------------------------------- 1 | I am rank 0 of 4 2 | I am rank 1 of 4 3 | I am rank 2 of 4 4 | I am rank 3 of 4 5 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 6 | warnings.warn( 7 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 8 | warnings.warn( 9 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 10 | warnings.warn( 11 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 12 | warnings.warn( 13 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0286s). Check your callbacks. 14 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0053s vs `on_train_batch_end` time: 0.0286s). Check your callbacks. 15 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0285s). Check your callbacks. 16 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0285s). Check your callbacks. 17 | 29/29 - 7s - loss: 0.6509 - accuracy: 0.7879 18 | Epoch 1/20 19 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 
20 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 21 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0054s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 22 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0055s vs `on_train_batch_end` time: 0.0151s). Check your callbacks. 23 | 29/29 - 1s - loss: 0.6974 - accuracy: 0.7748 24 | Epoch 2/20 25 | 29/29 - 0s - loss: 0.1578 - accuracy: 0.9548 26 | Epoch 3/20 27 | 29/29 - 0s - loss: 0.0885 - accuracy: 0.9740 28 | 29 | Epoch 3: finished gradual learning rate warmup to 0.01. 30 | Epoch 4/20 31 | 29/29 - 0s - loss: 0.0667 - accuracy: 0.9806 32 | Epoch 5/20 33 | 29/29 - 0s - loss: 0.0674 - accuracy: 0.9795 34 | Epoch 6/20 35 | 29/29 - 0s - loss: 0.0519 - accuracy: 0.9839 36 | Epoch 7/20 37 | 29/29 - 0s - loss: 0.0482 - accuracy: 0.9857 38 | Epoch 8/20 39 | 29/29 - 0s - loss: 0.0408 - accuracy: 0.9874 40 | Epoch 9/20 41 | 29/29 - 0s - loss: 0.0349 - accuracy: 0.9899 42 | Epoch 10/20 43 | 29/29 - 0s - loss: 0.0336 - accuracy: 0.9883 44 | Epoch 11/20 45 | 29/29 - 0s - loss: 0.0308 - accuracy: 0.9902 46 | Epoch 12/20 47 | 29/29 - 0s - loss: 0.0275 - accuracy: 0.9911 48 | Epoch 13/20 49 | 29/29 - 0s - loss: 0.0234 - accuracy: 0.9927 50 | Epoch 14/20 51 | 29/29 - 0s - loss: 0.0218 - accuracy: 0.9925 52 | Epoch 15/20 53 | 29/29 - 0s - loss: 0.0255 - accuracy: 0.9917 54 | Epoch 16/20 55 | 29/29 - 0s - loss: 0.0284 - accuracy: 0.9906 56 | Epoch 17/20 57 | 29/29 - 0s - loss: 0.0203 - accuracy: 0.9935 58 | Epoch 18/20 59 | 29/29 - 0s - loss: 0.0240 - accuracy: 0.9923 60 | Epoch 19/20 61 | 29/29 - 0s - loss: 0.0189 - accuracy: 0.9935 62 | Epoch 20/20 63 | 29/29 - 0s - loss: 0.0203 - accuracy: 0.9924 64 | Total time: 3.7076730728149414 second 65 | -------------------------------------------------------------------------------- /04_distributedLearning/results/concise_8.out: -------------------------------------------------------------------------------- 1 | I am rank 4 of 8 2 | I am rank 0 of 8 3 | I am rank 1 of 8 4 | I am rank 2 of 8 5 | I am rank 3 of 8 6 | I am rank 5 of 8 7 | I am rank 6 of 8 8 | I am rank 7 of 8 9 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 10 | warnings.warn( 11 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 12 | warnings.warn( 13 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 14 | warnings.warn( 15 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 16 | warnings.warn( 17 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some 
callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 18 | warnings.warn( 19 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 20 | warnings.warn( 21 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 22 | warnings.warn( 23 | /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/lib/python3.8/site-packages/horovod/_keras/callbacks.py:58: UserWarning: Some callbacks may not have access to the averaged metrics, see https://github.com/horovod/horovod/issues/2440 24 | warnings.warn( 25 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 26 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0059s vs `on_train_batch_end` time: 0.0309s). Check your callbacks. 27 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0059s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 28 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0062s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 29 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0307s). Check your callbacks. 30 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 31 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0060s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 32 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0061s vs `on_train_batch_end` time: 0.0308s). Check your callbacks. 33 | 14/14 - 11s - loss: 1.0856 - accuracy: 0.6493 34 | Epoch 1/20 35 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 36 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0056s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 37 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 38 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0056s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 39 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 40 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 
41 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0169s). Check your callbacks. 42 | WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0057s vs `on_train_batch_end` time: 0.0170s). Check your callbacks. 43 | 14/14 - 1s - loss: 1.1651 - accuracy: 0.6348 44 | Epoch 2/20 45 | 14/14 - 0s - loss: 0.3157 - accuracy: 0.9061 46 | Epoch 3/20 47 | 14/14 - 0s - loss: 0.1135 - accuracy: 0.9653 48 | 49 | Epoch 3: finished gradual learning rate warmup to 0.01. 50 | Epoch 4/20 51 | 14/14 - 0s - loss: 0.1007 - accuracy: 0.9700 52 | Epoch 5/20 53 | 14/14 - 0s - loss: 0.0634 - accuracy: 0.9817 54 | Epoch 6/20 55 | 14/14 - 0s - loss: 0.0586 - accuracy: 0.9802 56 | Epoch 7/20 57 | 14/14 - 0s - loss: 0.0530 - accuracy: 0.9859 58 | Epoch 8/20 59 | 14/14 - 0s - loss: 0.0452 - accuracy: 0.9859 60 | Epoch 9/20 61 | 14/14 - 0s - loss: 0.0441 - accuracy: 0.9847 62 | Epoch 10/20 63 | 14/14 - 0s - loss: 0.0330 - accuracy: 0.9891 64 | Epoch 11/20 65 | 14/14 - 0s - loss: 0.0326 - accuracy: 0.9886 66 | Epoch 12/20 67 | 14/14 - 0s - loss: 0.0316 - accuracy: 0.9895 68 | Epoch 13/20 69 | 14/14 - 0s - loss: 0.0284 - accuracy: 0.9914 70 | Epoch 14/20 71 | 14/14 - 0s - loss: 0.0311 - accuracy: 0.9894 72 | Epoch 15/20 73 | 14/14 - 0s - loss: 0.0257 - accuracy: 0.9919 74 | Epoch 16/20 75 | 14/14 - 0s - loss: 0.0306 - accuracy: 0.9901 76 | Epoch 17/20 77 | 14/14 - 0s - loss: 0.0290 - accuracy: 0.9899 78 | Epoch 18/20 79 | 14/14 - 0s - loss: 0.0247 - accuracy: 0.9923 80 | Epoch 19/20 81 | 14/14 - 0s - loss: 0.0271 - accuracy: 0.9914 82 | Epoch 20/20 83 | 14/14 - 0s - loss: 0.0222 - accuracy: 0.9926 84 | Total time: 2.2868692874908447 second 85 | -------------------------------------------------------------------------------- /04_distributedLearning/submissions/qsub_polaris.sc: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #PBS -l walltime=00:30:00 3 | #PBS -l nodes=4:ppn=4 4 | #PBS -N atpesc_horovod 5 | #PBS -k doe 6 | #PBS -j oe 7 | #PBS -A ATPESC_2024 8 | 9 | module use /soft/modulefiles 10 | module load conda 11 | conda activate 12 | NODES=$(sort ${PBS_NODEFILE} | uniq -c | sort -n | wc -l) 13 | GPUS_PER_NODE=4 14 | RANKS=$((NODES * GPUS_PER_NODE)) 15 | echo NODES=$NODES PPN=$GPUS_PER_NODE RANKS=$RANKS 16 | 17 | aprun -n 1 -N 1 python Horovod/04_keras_cnn_concise_hvd.py 18 | aprun -n 2 -N 2 python Horovod/04_keras_cnn_concise_hvd.py 19 | aprun -n 4 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 20 | aprun -n 8 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 21 | aprun -n 16 -N 4 python Horovod/04_keras_cnn_concise_hvd.py 22 | -------------------------------------------------------------------------------- /04_distributedLearning/submissions/qsub_thetagpu.sc: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #COBALT -n 1 3 | #COBALT -t 0:15:00 4 | #COBALT -q training-gpu 5 | #COBALT -A ATPESC2023 6 | 7 | 8 | # source /lus/theta-fs0/software/thetagpu/conda/2021-06-26/mconda3/setup.sh 9 | 10 | module load conda/2022-07-01; conda activate 11 | export http_proxy=http://theta-proxy.tmi.alcf.anl.gov:3128 12 | export https_proxy=https://theta-proxy.tmi.alcf.anl.gov:3128 13 | 14 | mpirun -np 1 python Horovod/04_keras_cnn_concise_hvd.py >& concise_1.out.gpu 15 | mpirun -np 2 python Horovod/04_keras_cnn_concise_hvd.py >& concise_2.out.gpu 16 | mpirun -np 4 python 
Horovod/04_keras_cnn_concise_hvd.py >& concise_4.out.gpu 17 | mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py >& concise_8.out.gpu 18 | 19 | 20 | HOROVOD_TIMELINE=gpu.json LD_PRELOAD=/soft/perftools/hpctw/lib/libmpitrace.so mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py 21 | HOROVOD_TIMELINE=cpu.json LD_PRELOAD=/soft/perftools/hpctw/lib/libmpitrace.so mpirun -np 8 python Horovod/04_keras_cnn_concise_hvd.py --device cpu 22 | 23 | -------------------------------------------------------------------------------- /05_aiTestbed/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/.DS_Store -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/Cerebras_Wafer-Scale_Cluster_login_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Cerebras/Cerebras_Wafer-Scale_Cluster_login_diagram.png -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/README.md: -------------------------------------------------------------------------------- 1 | # Cerebras 2 | 3 | ## Connection to a CS-2 node 4 | 5 | Connection to one of the CS-2 cluster login nodes requires an MFA passcode for authentication - either an 8-digit passcode generated by an app on your mobile device (e.g. MobilePASS+) or a CRYPTOCard-generated passcode prefixed by a 4-digit PIN. 6 | 7 | ![CS-2 connection diagram](./Cerebras_Wafer-Scale_Cluster_login_diagram.png) 8 | 9 | To connect, ssh to one of the CS-2 login nodes: 10 | ```bash 11 | ssh ALCFUserID@cerebras.ai.alcf.anl.gov 12 | ``` 13 | 14 | ## Create Virtual Environment 15 | 16 | ### PyTorch virtual environment 17 | Create a PyTorch virtual environment for Cerebras: 18 | ```bash 19 | mkdir ~/R_2.3.0 20 | cd ~/R_2.3.0 21 | # Note: "deactivate" does not actually work in scripts. 22 | deactivate 23 | rm -r venv_cerebras_pt 24 | /software/cerebras/python3.8/bin/python3.8 -m venv venv_cerebras_pt 25 | source venv_cerebras_pt/bin/activate 26 | pip install --upgrade pip 27 | pip install cerebras_pytorch==2.3.0 28 | ``` 29 | ## Clone Cerebras modelzoo 30 | 31 | We use an example from the [Cerebras Modelzoo repository](https://github.com/Cerebras/modelzoo) for this hands-on. 32 | Clone the modelzoo repository.
33 | 34 | ```bash 35 | mkdir ~/R_2.3.0 36 | cd ~/R_2.3.0 37 | git clone https://github.com/Cerebras/modelzoo.git 38 | cd modelzoo 39 | git tag 40 | git checkout Release_2.3.0 41 | ``` 42 | 43 | ## Job Queuing and Submission 44 | 45 | The CS-2 cluster has its own Kubernetes-based system for job submission and queuing. Jobs are started automatically through the Python scripts. 46 | 47 | Use the Cerebras cluster command-line tool to get additional information about the jobs. 48 | 49 | * Jobs that have not yet completed can be listed with: 50 | `(venv_pt) $ csctl get jobs` 51 | * Jobs can be canceled as shown: 52 | `(venv_tf) $ csctl cancel job wsjob-eyjapwgnycahq9tus4w7id` 53 | 54 | See `csctl -h` for more options. 55 | 56 | ## Run Examples 57 | 58 | Refer to the instructions in the file below to run the GPT-J model. 59 | 61 | * [GPT-J](./gpt-j.md) 62 | 63 | # Useful Resources 64 | 65 | * [ALCF Cerebras Documentation](https://docs.alcf.anl.gov/ai-testbed/cerebras/system-overview/) 66 | * [Cerebras Documentation](https://docs.cerebras.net/en/latest/wsc/index.html) 67 | * [Cerebras Modelzoo Repo](https://github.com/Cerebras/modelzoo/tree/main/modelzoo) 68 | * Datasets Path: `/software/cerebras/dataset` 69 | -------------------------------------------------------------------------------- /05_aiTestbed/Cerebras/gpt-j.md: -------------------------------------------------------------------------------- 1 | # GPT-J on Cerebras 2 | 3 | Go to the directory with the GPT-J example. 4 | ```bash 5 | cd ~/R_2.3.0/modelzoo/src/cerebras/modelzoo/models/nlp/gptj 6 | ``` 7 | 8 | Activate the PyTorch virtual environment and install the requirements. 9 | ```bash 10 | source ~/R_2.3.0/venv_cerebras_pt/bin/activate 11 | pip install -r ~/R_2.3.0/modelzoo/requirements.txt 12 | ``` 13 | 14 | Replace the config file with the correct configuration file. 15 | ```bash 16 | cp /software/cerebras/dataset/gptj/params_gptj_6B_sampleds.yaml configs/params_gptj_6B_sampleds.yaml 17 | ``` 18 | 19 | Run the training job. 20 | ```bash 21 | export MODEL_DIR=model_dir_gptj 22 | if [ -d "$MODEL_DIR" ]; then rm -Rf $MODEL_DIR; fi 23 | python run.py CSX --job_labels name=gptj_pt --params configs/params_gptj_6B_sampleds.yaml --num_csx=2 --mode train --model_dir $MODEL_DIR --mount_dirs /home/ /software --python_paths /home/$(whoami)/R_2.3.0/modelzoo/src --compile_dir $(whoami) |& tee mytest.log 24 | ```
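Because the training command above pipes its output through `tee`, the progress can also be followed from a second shell while the job runs. A minimal sketch, assuming the run was started from the gptj directory shown earlier:

```bash
# From a second terminal, follow the log written by the training command above
tail -f ~/R_2.3.0/modelzoo/src/cerebras/modelzoo/models/nlp/gptj/mytest.log
```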
26 | Sample Output (last section) 27 | 28 | ```bash 29 | 2023-11-29 20:59:19,223 INFO: Beginning appliance run 30 | 2023-11-29 21:03:53,875 INFO: | Train Device=CSX, Step=100, Loss=8.43750, Rate=43.70 samples/sec, GlobalRate=43.70 samples/sec 31 | 2023-11-29 21:08:28,779 INFO: | Train Device=CSX, Step=200, Loss=8.12500, Rate=43.67 samples/sec, GlobalRate=43.67 samples/sec 32 | 2023-11-29 21:08:28,781 INFO: Saving checkpoint at step 200 33 | 2023-11-29 21:13:56,695 INFO: Saved checkpoint model_dir_gptj/checkpoint_200.mdl 34 | 2023-11-29 21:14:30,135 INFO: Heartbeat thread stopped for wsjob-kd4olqkhu6ya8qqzt88utd. 35 | 2023-11-29 21:14:30,142 INFO: Training completed successfully! 36 | 2023-11-29 21:14:30,142 INFO: Processed 24000 sample(s) in 910.883781998 seconds. 37 | ``` 38 |
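While the run is in flight, it can be monitored or canceled from a second terminal with the `csctl` commands described in the Cerebras README above. A minimal sketch (substitute the `wsjob-...` id printed in your own log):

```bash
# From a second terminal on the CS-2 login node:
source ~/R_2.3.0/venv_cerebras_pt/bin/activate
csctl get jobs                          # list jobs that have not yet completed
# csctl cancel job wsjob-<your-job-id>  # cancel if needed, using the id from your log
```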
39 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/README.md: -------------------------------------------------------------------------------- 1 | # Graphcore 2 | 3 | ## Connection to Graphcore 4 | 5 | ![Graphcore connection diagram](./graphcore_login.png) 6 | 7 | Log in to the Graphcore login node from your local machine. 8 | Once you are on the login node, ssh to one of the Graphcore nodes. 9 | 10 | ```bash 11 | local > ssh ALCFUserID@gc-login-01.ai.alcf.anl.gov 12 | # or 13 | local > ssh ALCFUserID@gc-login-02.ai.alcf.anl.gov 14 | ``` 15 | ```bash 16 | login-01.ai.alcf.anl.gov > ssh gc-poplar-02.ai.alcf.anl.gov 17 | # or 18 | login-01.ai.alcf.anl.gov > ssh gc-poplar-03.ai.alcf.anl.gov 19 | # or 20 | login-01.ai.alcf.anl.gov > ssh gc-poplar-04.ai.alcf.anl.gov 21 | ``` 22 | 23 | ## Create Virtual Environment 24 | 25 | ### PyTorch virtual environment 26 | 27 | ```bash 28 | mkdir -p ~/venvs/graphcore 29 | virtualenv ~/venvs/graphcore/poptorch33_env 30 | source ~/venvs/graphcore/poptorch33_env/bin/activate 31 | 32 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0 33 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 34 | pip install $POPLAR_SDK_ROOT/poptorch-3.3.0+113432_960e9c294b_ubuntu_20_04-cp38-cp38-linux_x86_64.whl 35 | ``` 36 | 37 | ### TensorFlow virtual environment 38 | 39 | ```bash 40 | virtualenv ~/venvs/graphcore/tensorflow2_33_env 41 | source ~/venvs/graphcore/tensorflow2_33_env/bin/activate 42 | 43 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0 44 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 45 | pip install $POPLAR_SDK_ROOT/tensorflow-2.6.3+gc3.3.0+251580+08d96978c7f+amd_znver1-cp38-cp38-linux_x86_64.whl 46 | pip install $POPLAR_SDK_ROOT/keras-2.6.0+gc3.3.0+251582+a3785372-py2.py3-none-any.whl 47 | ``` 48 | ## Miscellaneous Environment Variables 49 | ```bash 50 | mkdir ~/tmp 51 | export TF_POPLAR_FLAGS=--executable_cache_path=~/tmp 52 | export POPTORCH_CACHE_DIR=~/tmp 53 | 54 | export POPART_LOG_LEVEL=WARN 55 | export POPLAR_LOG_LEVEL=WARN 56 | export POPLIBS_LOG_LEVEL=WARN 57 | 58 | export PYTHONPATH=/software/graphcore/poplar_sdk/3.3.0/poplar-ubuntu_20_04-3.3.0+7857-b67b751185/python:$PYTHONPATH 59 | ``` 60 | ## Clone Graphcore Examples 61 | 62 | We use examples from the [Graphcore Examples repository](https://github.com/graphcore/examples) for this hands-on. 63 | Clone the Graphcore Examples repository. 64 | ```bash 65 | mkdir ~/graphcore 66 | cd ~/graphcore 67 | git clone https://github.com/graphcore/examples.git 68 | cd examples 69 | git tag 70 | git checkout v3.3.0 71 | ``` 72 | 73 | ## Job Queuing and Submission 74 | 75 | ALCF's Graphcore POD64 system uses Slurm for job submission and queueing. Below are some of the important commands for using Slurm. 76 | 77 | * `srun` : The Slurm command `srun` can be used to run individual Python scripts. Use the `--ipus=` option to specify the number of IPUs required for the run, 78 | e.g. `srun --ipus=1 python mnist_poptorch.py` 79 | * `sbatch` : jobs can be submitted to the Slurm workload manager through a batch script by using the `sbatch` command. 80 | * `squeue` : provides information about jobs located in the Slurm scheduling queue. 81 | * `scancel` : is used to signal or cancel jobs, job arrays, or job steps. 82 | 83 | ## Run Examples 84 | 85 | Refer to the respective instructions below. 86 | 88 | * [GPT2](./gpt2.md) 89 | 90 | ```bash 91 | Note: Precompiled artifacts are present at the /software/graphcore/projects/models_compile location for the above models.
92 | Copy them to your ~/tmp and set export POPTORCH_CACHE_DIR=~/tmp to skip the compile process. 93 | ``` 94 | ## Profiling 95 | 96 | We will use PopVision Graph Analyzer and System Analyzer to produce profiles. 97 | 98 | * [PopVision Graph Analyzer User Guide](https://docs.graphcore.ai/projects/graph-analyser-userguide/en/latest/) 99 | * [PopVision System Analyzer User Guide](https://docs.graphcore.ai/projects/system-analyser-userguide/en/latest/) 100 | * [PopVision Tools Downloads](https://www.graphcore.ai/developer/popvision-tools#downloads) 101 | 102 | #### PopVision Graph Analyzer 103 | 104 | To generate a profile for PopVision Graph Analyzer, run the executable with the following prefix: 105 | 106 | ```bash 107 | $ POPLAR_ENGINE_OPTIONS='{"autoReport.all":"true", "autoReport.directory":"./graph_profile", "profiler.includeFlopEstimates": "true"}' python mnist_poptorch.py 108 | ``` 109 | 110 | This will generate all the graph profiling reports along with FLOP estimates and save the output to the graph_profile directory. 111 | 112 | To visualize the profiles, download the generated profiles to a local machine and open them using PopVision Graph Analyzer. 113 | 114 | #### PopVision System Analyzer 115 | 116 | To generate a profile for PopVision System Analyzer, run the executable with the following prefix: 117 | 118 | ```bash 119 | $ PVTI_OPTIONS='{"enable":"true", "directory": "./system_profile"}' python mnist_poptorch.py 120 | ``` 121 | This will generate all the system profiling reports and save the output to the system_profile directory. 122 | 123 | To visualize the profiles, download the generated profiles to a local machine and open them using PopVision System Analyzer. 124 | 125 | ## Useful Resources 126 | 127 | * [ALCF Graphcore Documentation](https://docs.alcf.anl.gov/ai-testbed/graphcore/system-overview/) 128 | * [Graphcore Documentation](https://docs.graphcore.ai/en/latest/) 129 | * [Graphcore Examples Repository](https://github.com/graphcore/examples) 130 | * Graphcore SDK Path: `/software/graphcore/poplar_sdk` 131 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/gpt2.md: -------------------------------------------------------------------------------- 1 | # GPT2 on Graphcore 2 | 3 | These instructions train a GPT-2 PyTorch model on the POD16.
4 | 5 | ##### Go to the directory with the GPT2 example 6 | ```bash 7 | cd ~/graphcore/examples/nlp/gpt2/pytorch 8 | ``` 9 | 10 | ##### Create a new PopTorch environment 11 | ```bash 12 | POPLAR_SDK_ROOT=/software/graphcore/poplar_sdk/3.3.0/ 13 | export POPLAR_SDK_ROOT=$POPLAR_SDK_ROOT 14 | 15 | virtualenv ~/venvs/graphcore/poptorch33_gpt2 16 | source ~/venvs/graphcore/poptorch33_gpt2/bin/activate 17 | pip install $POPLAR_SDK_ROOT/poptorch-3.3.0+113432_960e9c294b_ubuntu_20_04-cp38-cp38-linux_x86_64.whl 18 | export PYTHONPATH=$POPLAR_SDK_ROOT/python:$PYTHONPATH 19 | ``` 20 | 21 | ##### Install requirements 22 | 23 | ```bash 24 | pip3 install -r requirements.txt 25 | ``` 26 | 27 | ##### Run GPT2 on 4 IPUs (single instance) 28 | 29 | * Compile and run from scratch: 30 | ```bash 31 | /opt/slurm/bin/srun --ipus=4 python /home/$USER/graphcore/examples/nlp/gpt2/pytorch/train_gpt2.py --model gpt2 --ipus-per-replica 4 --replication-factor 1 --gradient-accumulation 2048 --device-iterations 8 --batch-size 1 --layers-per-ipu 0 4 4 4 --matmul-proportion 0.15 0.15 0.15 0.15 --max-len 1024 --optimizer AdamW --learning-rate 0.00015 --lr-schedule cosine --lr-warmup 0.01 --remap-logit True --enable-sequence-serialized True --embedding-serialization-factor 4 --recompute-checkpoint-every-layer True --enable-half-partials True --replicated-tensor-sharding True --dataset 'generated' --epochs 1 32 | ``` 33 | * Run from precompiled artifacts: 34 | ```bash 35 | Note: Precompiled artifacts are present at the /software/graphcore/projects/models_compile location for the above models. 36 | Copy them to your ~/tmp and set export POPTORCH_CACHE_DIR=~/tmp to skip the compile process. 37 | ``` 38 | 39 | ##### Run GPT2 on 16 IPUs (4 instances) 40 | ```bash 41 | /opt/slurm/bin/srun --ipus=16 python /home/$USER/graphcore/examples/nlp/gpt2/pytorch/train_gpt2.py --model gpt2 --ipus-per-replica 4 --replication-factor 4 --gradient-accumulation 2048 --device-iterations 8 --batch-size 1 --layers-per-ipu 0 4 4 4 --matmul-proportion 0.15 0.15 0.15 0.15 --max-len 1024 --optimizer AdamW --learning-rate 0.00015 --lr-schedule cosine --lr-warmup 0.01 --remap-logit True --enable-sequence-serialized True --embedding-serialization-factor 4 --recompute-checkpoint-every-layer True --enable-half-partials True --replicated-tensor-sharding True --dataset 'generated' --epochs 1 42 | ``` 43 |
44 | Sample Output 45 | 46 | ```bash 47 | srun: job 10697 queued and waiting for resources 48 | srun: job 10697 has been allocated resources 49 | Building (if necessary) and loading remap_tensor_ce. 50 | Failed to find compiled extension; rebuilding. 51 | Building (if necessary) and loading residual_add_inplace_pattern. 52 | Model initializing 53 | -------------------- Device Allocation -------------------- 54 | Embedding --> IPU 0 55 | Layer 0 --> IPU 1 56 | Layer 1 --> IPU 1 57 | Layer 2 --> IPU 1 58 | Layer 3 --> IPU 1 59 | Layer 4 --> IPU 2 60 | Layer 5 --> IPU 2 61 | Layer 6 --> IPU 2 62 | Layer 7 --> IPU 2 63 | Layer 8 --> IPU 3 64 | Layer 9 --> IPU 3 65 | Layer 10 --> IPU 3 66 | Layer 11 --> IPU 3 67 | LM_head --> IPU 0 68 | 69 | step 0 of epoch 0, loss: 10.913220405578613, acc: 2.0071864128112793e-05, lr: 0.00012803300858899104, throughput: 646.8439205981404 samples/sec 70 | step 1 of epoch 0, loss: 10.836345672607422, acc: 1.9788742065429688e-05, lr: 7.5e-05, throughput: 1058.0979097185766 samples/sec 71 | step 2 of epoch 0, loss: 10.831247329711914, acc: 2.0518898963928223e-05, lr: 2.1966991411008938e-05, throughput: 1058.7595523807183 samples/sec 72 | step 3 of epoch 0, loss: 10.829034805297852, acc: 1.990795135498047e-05, lr: 0.0, throughput: 1059.6762623043378 samples/sec 73 | ``` 74 |
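While a submitted `srun` job is waiting for resources (the `srun: job 10697 queued` lines above), its state can be checked with the Slurm commands listed in the Graphcore README. A minimal sketch, assuming `squeue` and `scancel` live alongside `srun` in `/opt/slurm/bin/`:

```bash
# Show your queued and running jobs on the Graphcore Slurm cluster
/opt/slurm/bin/squeue -u $USER
# Cancel a job by the id srun/squeue reports, e.g. job 10697 above
# /opt/slurm/bin/scancel 10697
```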
75 | 76 | -------------------------------------------------------------------------------- /05_aiTestbed/Graphcore/graphcore_login.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Graphcore/graphcore_login.png -------------------------------------------------------------------------------- /05_aiTestbed/Groq/README.md: -------------------------------------------------------------------------------- 1 | # Groq 2 | 3 | ## Connection to Groq 4 | 5 | ![Groq connection diagram](./groqrack_system_diagram.png) 6 | 7 | Log in to the Groq login node from your local machine. 8 | Once you are on the login node, ssh to one of the Groq nodes. 9 | 10 | ```bash 11 | local > ssh ALCFUserID@groq.ai.alcf.anl.gov 12 | ``` 13 | ```bash 14 | groq-login > ssh groq-r01-gn-01.ai.alcf.anl.gov 15 | # or 16 | groq-login > ssh groq-r01-gn-09.ai.alcf.anl.gov 17 | # or any node with a hostname of the form groq-r01-gn-0[1-9].ai.alcf.anl.gov 18 | ``` 19 | 20 | ## Create Virtual Environment 21 | 22 | ### Install Miniconda 23 | 24 | ```bash 25 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 26 | bash Miniconda3-latest-Linux-x86_64.sh 27 | ``` 28 | 29 | ### PyTorch virtual environment 30 | 31 | ```bash 32 | export PYTHON_VERSION=3.10.12 33 | conda create -n groqflow python=$PYTHON_VERSION -y 34 | conda activate groqflow 35 | ``` 36 | 37 | ### Install Groqflow 38 | 39 | ```bash 40 | # Alter this if you have cloned groqflow to some other location. 41 | git clone https://github.com/groq/groqflow.git 42 | cd ~/groqflow 43 | if [ -d "groqflow.egg-info" ]; then rm -r groqflow.egg-info; fi 44 | pip install --upgrade pip 45 | pip list --format=freeze > frozen.txt 46 | pip install -r frozen.txt -e . 47 | pushd . 48 | cd demo_helpers 49 | if [ -d "groqflow_demo_helpers.egg-info" ]; then rm -r groqflow_demo_helpers.egg-info; fi 50 | pip install -e . 51 | popd 52 | pip install soundfile 53 | ``` 54 | 55 | 56 | ## Job Queuing and Submission 57 | 58 | Groq jobs in the AI Testbed's groqrack are managed by the PBS job scheduler. 59 | 60 | * `qsub` : to submit a batch job using a script 61 | * `qstat`: to display queue information 62 | * `qdel`: to delete (cancel) a job 63 | * `qhold`: to hold a job 64 | 65 | ### Schedule a Batch Job 66 | 67 |
68 | Create a sample `run_minilmv2.sh` script as below: 69 | ```bash 70 | #!/bin/bash 71 | # >>> conda initialize >>> 72 | # !! Contents within this block are managed by 'conda init' !! 73 | __conda_setup="$(${HOME}'/miniconda3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" 74 | if [ $? -eq 0 ]; then 75 | eval "$__conda_setup" 76 | else 77 | if [ -f "${HOME}/miniconda3/etc/profile.d/conda.sh" ]; then 78 | . "${HOME}/miniconda3/etc/profile.d/conda.sh" 79 | else 80 | export PATH="${HOME}/miniconda3/bin:$PATH" 81 | fi 82 | fi 83 | unset __conda_setup 84 | # <<< conda initialize <<< 85 | conda activate groqflow 86 | cd ~/groqflow/proof_points/natural_language_processing/minilm 87 | pip install -r requirements.txt 88 | python minilmv2.py 89 | ``` 90 |
91 | 92 | Ensure you have a groqflow conda environment activated. 93 | ```bash 94 | conda activate groqflow 95 | ``` 96 | 97 | Then run the script as a batch job with PBS. This will reserve a full eight-card (chip) node. 98 | ```bash 99 | qsub -l select=1,place=excl run_minilmv2.sh 100 | ``` 101 | 102 | 103 | ### Schedule an Interactive Job 104 | 105 | The following command gives you a single Groq node interactively for 1 hour: 106 | ```bash 107 | qsub -I -l walltime=1:00:00 108 | ``` 109 | Other flags that can be used: 110 | ```bash 111 | -l ncpus=1 112 | -l groq_accelerator=1 113 | ``` 114 | 115 | Then activate your groqflow environment, clone the repo, and run Python scripts with: 116 | ```bash 117 | conda activate groqflow 118 | cd ~/groqflow/proof_points/natural_language_processing/minilm 119 | pip install -r requirements.txt 120 | python minilmv2.py 121 | ``` 122 | 127 | 128 | 129 | ## Useful Resources 130 | 131 | * [ALCF Groq Documentation](https://docs.alcf.anl.gov/ai-testbed/groq/system-overview/) 132 | * [Groq Documentation](https://support.groq.com/#/login) 133 | * [Groq Examples Repository](https://github.com/groq/groqflow/tree/main/proof_points) 134 | 135 | -------------------------------------------------------------------------------- /05_aiTestbed/Groq/groqrack_system_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Groq/groqrack_system_diagram.png -------------------------------------------------------------------------------- /05_aiTestbed/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to AI Testbeds at ALCF and Hands-on 2 | 3 | Please refer to [Slides here](./AI%20Testbeds%20Hands-on%20ATPESC2023.pdf) and [online documentation](https://docs.alcf.anl.gov/ai-testbed/getting-started/) -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/README.md: -------------------------------------------------------------------------------- 1 | # SambaNova 2 | 3 | ## Connection to SambaNova 4 | 5 | Connection to a SambaNova node is a two-step process. The first step is to `ssh` to the `login node`. The second step is to log in to a SambaNova node from the `login node`. 6 | 7 | ![SambaNova connection diagram](./sambanova_login.jpg) 8 | 9 | Log in to the SambaNova login node from your local machine. This uses the **MobilePASS+** token generated every time you log in to the system. 10 | 11 | In the examples below, replace ALCFUserID with your ALCF user id. 12 | ```bash 13 | ssh ALCFUserID@sambanova.alcf.anl.gov 14 | Password: < MobilePASS+ code > 15 | ``` 16 | 17 | Note: Use the ssh "-v" option in order to debug any ssh problems. 18 | 19 | Once you are on the login node, ssh to one of the SambaNova nodes. 20 | ```bash 21 | ssh sn30-r1-h1 22 | ``` 23 | 24 | You can also ssh to `sn30-r1-h1`, `sn30-r1-h2`, `sn30-r2-h1`, `sn30-r2-h2`, `sn30-r3-h1`, `sn30-r3-h2`, `sn30-r4-h1`, `sn30-r4-h2`. 25 | 26 | ## Pre-Built Sample Venv 27 | 28 | The SambaNova software stack and associated environment variables are automatically set up at login on an SN30 node. 29 | 30 | Each of the samples or application examples provided by SambaNova has its own pre-built virtual environment which can be readily used. They are present in the `/opt/sambaflow/apps/` directory tree within each of the applications.
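For a quick orientation you can browse these pre-built environments directly. A minimal sketch (the application name and venv layout below are assumptions; check the actual tree on the node):

```bash
# List the applications that ship with pre-built environments
ls /opt/sambaflow/apps/
# Hypothetical activation; the exact venv path varies per application
# source /opt/sambaflow/apps/<app>/venv/bin/activate
```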
31 | 32 | 33 | ## Run Examples 34 | 35 | Refer to the respective instructions below. 36 | 37 | Main examples: 38 | * [GPT 1.5B](./gpt15b.md) 39 | 40 | 41 | ## Additional Resources 42 | 43 | * [ALCF SambaNova Documentation](https://docs.alcf.anl.gov/ai-testbed/sambanova/getting-started/) 44 | * [SambaNova Documentation](https://docs.sambanova.ai/developer/latest/sambaflow-intro.html) 45 | * SambaNova applications path: `/opt/sambaflow/apps/` 46 | * SambaNova model scripts: `/data/ANL/scripts/` 47 | * Important datasets: `/software/sambanova/dataset/` 48 | -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/gpt15b.md: -------------------------------------------------------------------------------- 1 | # GPT 1.5B Parameter Model on SambaNova 2 | 3 | ##### Create and move to the following directory. 4 | 5 | ```bash 6 | mkdir -p ~/apps/nlp/Gpt1.5B_single 7 | cd ~/apps/nlp/Gpt1.5B_single 8 | ``` 9 | 10 | ##### Copy the scripts to compile and run 11 | 12 | ```bash 13 | cp /data/ANL/scripts/Gpt1.5B_base_single_compile.sh . 14 | cp /data/ANL/scripts/Gpt1.5B_base_single_run.sh . 15 | 16 | chmod +x Gpt1.5B_base_single_compile.sh 17 | chmod +x Gpt1.5B_base_single_run.sh 18 | ``` 19 | 20 | ##### Run the script to compile and run 21 | 22 | ```bash 23 | ./Gpt1.5B_base_single_compile.sh 32 24 | ``` 25 | For more information refer to [Gpt1.5B](https://docs.alcf.anl.gov/ai-testbed/sambanova/example-programs/#gpt-15b). 26 | 27 | Precompiled artifacts are available in the /data/scratch/preCompiled/ directory, which can be accessed from any compute node. 28 | To use them, change the OUTDIR path in the above scripts as below: 29 | ```bash 30 | OUTDIR=/data/scratch/preCompiled/${MODEL_NAME} 31 | ``` 32 | -------------------------------------------------------------------------------- /05_aiTestbed/SambaNova/sambanova_login.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/SambaNova/sambanova_login.jpg -------------------------------------------------------------------------------- /05_aiTestbed/Slides/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/05_aiTestbed/Slides/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [ATPESC 2024](https://extremecomputingtraining.anl.gov/agenda-2024/) 2 | 3 | At the beginning of the day, we will temporarily split into two groups. Attendees can choose between "Introduction to deep learning" (01_deepLearning) and "Building data pipelines" (02_dataPipelines). 4 | 5 | The "Introduction to deep learning" session will rely on Jupyter Notebooks which are targeted for running on [Google's Colaboratory Platform](https://colab.research.google.com) or [ALCF JupyterHub](https://www.alcf.anl.gov/support-center/theta/jupyter-hub). The Colab platform gives the user a virtual machine in which to run Python code, including machine learning code. The VM comes with a preinstalled environment that includes most of what is needed for these tutorials.
6 | 7 | The other sessions involve Python scripts executed on the [Polaris](https://docs.alcf.anl.gov/polaris/getting-started/) and [AI Testbed](https://www.alcf.anl.gov/alcf-ai-testbed) platforms at ALCF. 8 | 9 | ## Using Google Colab 10 | Google Colab involves running Jupyter notebooks, which you have experience with from earlier in the week. 11 | 12 | Do the following before you come to the tutorial: 13 | * You need a Google Account to use Colaboratory 14 | * Go to [Google's Colaboratory Platform](https://colab.research.google.com) 15 | * You should see this page 16 | ![start_page](README_imgs/colab_start_page.png) 17 | * Now you can open the `File` menu at the top left and select `Open Notebook`, which will open a dialogue box. 18 | * Select the `GitHub` tab in the dialogue box. 19 | * From here you can enter the URL for the GitHub repo: `https://github.com/argonne-lcf/ATPESC_MachineLearning` and hit `Enter`. 20 | ![open_github](README_imgs/colab_open_github.png) 21 | * This will show you a list of the Notebooks available in the repo. When you select a notebook from this list it will create a copy for you in your Colaboratory account (all `*.ipynb` files in the Colaboratory account will be stored in your Google Drive). 22 | * To use a GPU in the notebook select `Runtime` -> `Change Runtime Type` and select an accelerator. 23 | -------------------------------------------------------------------------------- /README_imgs/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/.DS_Store -------------------------------------------------------------------------------- /README_imgs/colab_clean_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_clean_page.png -------------------------------------------------------------------------------- /README_imgs/colab_open_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_open_github.png -------------------------------------------------------------------------------- /README_imgs/colab_start_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/colab_start_page.png -------------------------------------------------------------------------------- /README_imgs/collab_start_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/README_imgs/collab_start_page1.png -------------------------------------------------------------------------------- /extra_statisticalLearning/README.md: -------------------------------------------------------------------------------- 1 | # Overview of Statistical Learning Methods 2 | Authors: [Sam Foreman](https://www.samforeman.me)[^1] ([foremans@anl.gov](mailto:foremans@anl.gov)), Taylor Childers (jchilders@anl.gov), and Romit Maulik (rmaulik@anl.gov) 3 | 4 | - 📕 [Statistical Learning](./src/atpesc/notebooks/statistical_learning.ipynb) 5 | - 📊
[slides](https://saforem2.github.io/ATPESC-StatisticalLearning) 6 | 7 | To install, 8 | 9 | ```bash 10 | git clone https://github.com/argonne-lcf/ATPESC_MachineLearning 11 | cd ATPESC_MachineLearning/extra_statisticalLearning 12 | python3 -m pip install -e . 13 | ``` 14 | 15 | and launch the [notebook](./src/atpesc/notebooks/statistical_learning.ipynb) 16 | 17 | [^1]: Presenter 18 | -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step1.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step2.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step3.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/atpesc-k-means-step4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/atpesc-k-means-step4.png -------------------------------------------------------------------------------- /extra_statisticalLearning/assets/pngs/learning-rate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/assets/pngs/learning-rate.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/image.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/kmeans.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/mse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/mse.pdf --------------------------------------------------------------------------------
/extra_statisticalLearning/images/mse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/mse.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd.pdf -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd.png -------------------------------------------------------------------------------- /extra_statisticalLearning/images/sgd_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/images/sgd_example.png -------------------------------------------------------------------------------- /extra_statisticalLearning/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | -------------------------------------------------------------------------------- /extra_statisticalLearning/setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/setup.cfg -------------------------------------------------------------------------------- /extra_statisticalLearning/src/atpesc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/argonne-lcf/ATPESC_MachineLearning/80457cb119a8db7906942a3abd373f312ca60a7e/extra_statisticalLearning/src/atpesc/__init__.py -------------------------------------------------------------------------------- /extra_statisticalLearning/src/atpesc/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | common.py 3 | 4 | Contains helper functions. 
5 | """ 6 | from __future__ import absolute_import, division, print_function, annotations 7 | from typing import Any, Optional 8 | import logging 9 | 10 | import numpy as np 11 | from sklearn import datasets 12 | from pathlib import Path 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.cluster import KMeans 15 | 16 | from atpesc.utils.plots import plot_kmeans_obj 17 | 18 | SEED = 1234 19 | DEFAULT_LR = float(1e-6) 20 | 21 | HERE = Path(__file__).parent 22 | SECTION_DIR = HERE.parent.parent 23 | DATA_DIR = HERE.parent.parent.joinpath('data') 24 | 25 | Array = np.ndarray 26 | 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | def predict_price(slope: Any, input_area: Any): 32 | return slope * input_area 33 | 34 | 35 | def evaluate(slope, input_area, true_price) -> np.floating: 36 | price_prediction = predict_price(slope, input_area) 37 | return np.mean((true_price - price_prediction) ** 2) 38 | 39 | 40 | def learn( 41 | input_area: float, 42 | input_price: float, 43 | input_slope: float, 44 | learning_rate: Optional[float] = None 45 | ) -> np.floating: 46 | learning_rate = DEFAULT_LR if learning_rate is None else learning_rate 47 | # ------------------------------------ 48 | # 1. First compute: df/dx, where: 49 | # f = predict_price 50 | # x = input_size 51 | # ------------------------------------ 52 | tmp = (2. * input_area) * ( 53 | predict_price(input_slope, input_area) - input_price 54 | ) 55 | dfdx = np.mean(tmp) 56 | # -------------------------------------------- 57 | # 2. Update the slope via SGD Update step 58 | # -------------------------------------------- 59 | return input_slope - learning_rate * dfdx 60 | 61 | 62 | def load_cancer_data() -> tuple[Array, Array]: 63 | """Returns cancer dataset (unscaled).""" 64 | from sklearn import datasets 65 | import pandas as pd 66 | data = datasets.load_breast_cancer() 67 | assert isinstance(data, dict) 68 | x = pd.DataFrame( 69 | data['data'], 70 | columns=data['feature_names'] 71 | ) 72 | x = x[sorted(x.columns)] 73 | y = data['target'] 74 | 75 | return x, y 76 | 77 | 78 | def create_run_blob_kmeans( 79 | nsamples: int, 80 | nfeatures: int, 81 | nclusters: int 82 | ) -> None: 83 | x, _ = datasets.make_blobs( 84 | n_samples=nsamples, 85 | n_features=nfeatures, 86 | centers=nclusters, 87 | random_state=SEED 88 | ) 89 | 90 | # Normalize features 91 | x_ = StandardScaler().fit_transform(x) 92 | model = KMeans( 93 | n_clusters=nclusters, 94 | init='k-means++', 95 | n_init=10, 96 | max_iter=300, 97 | random_state=SEED, 98 | # n_jobs=4, 99 | ) 100 | model.fit(x_) 101 | plot_kmeans_obj(x_, 20) 102 | -------------------------------------------------------------------------------- /introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Introductory Material\n", 8 | "\n", 9 | "Run this introductory IPython Notebook into your [Colaborotory](https://colab.research.google.com).\n", 10 | "\n", 11 | "This is a 'Markdown' or 'Text' cell where notes can be made to help you remember what you were doing." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# This is a code cell where you can run Python code on the local machine\n", 21 | "import tensorflow as tf\n", 22 | "print(f'tensorflow version: {tf.__version__}')\n", 23 | "# when you're done typing you hit Shift+Enter and the cell is executed in the Python Interpreter" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "You can also run bash shell commands using `!`" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "!echo Hello World" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "!ls" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "Let's check out the ATPESC_MachineLearning repo onto our local Colaboratory Virtual Machine:" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "!git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git repo" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "!ls repo" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# Notebook Runtimes\n", 81 | "While running a notebook, keep in mind that the Python Interpreter is running in the background and will retain all previously executed code, meaning variables persist until the runtime is killed or reset. This can be done from the 'Runtime' menu above.\n", 82 | "\n", 83 | "Let's see how this works." 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "x = 42\n", 93 | "print(x)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "Now reset/restart the runtime via the menu and run the following:" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# This will cause a 'NameError' exception since the variable does not exist.\n", 110 | "print(x)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "On Colaboratory there is another level of restart/reset called 'Factory reset runtime' which not only restarts the Python interpreter but also reboots your VM, causing all local files to be lost.
For example, run the following command:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# list the repo we cloned from git\n", 127 | "!ls repo" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Now run 'Factory reset runtime' from the menu and then run this command:" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "# this will throw an error since the repo directory no longer exists\n", 144 | "!ls repo" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "# clone the repo again; we want to use it later in our jobs\n", 154 | "!git clone https://github.com/argonne-lcf/ATPESC_MachineLearning.git repo" 155 | ] 156 | } 157 | ], 158 | "metadata": { 159 | "kernelspec": { 160 | "display_name": "Python 3", 161 | "language": "python", 162 | "name": "python3" 163 | }, 164 | "language_info": { 165 | "codemirror_mode": { 166 | "name": "ipython", 167 | "version": 3 168 | }, 169 | "file_extension": ".py", 170 | "mimetype": "text/x-python", 171 | "name": "python", 172 | "nbconvert_exporter": "python", 173 | "pygments_lexer": "ipython3", 174 | "version": "3.7.4" 175 | } 176 | }, 177 | "nbformat": 4, 178 | "nbformat_minor": 2 179 | } 180 | --------------------------------------------------------------------------------