├── CONTRIBUTING.md ├── CommonNativeJarsTable.md ├── DataAnalytics.md ├── HPC ├── README.md ├── images │ └── headNode-info-ec2console.png ├── scripts-code_saturne │ ├── cs_user_parameters.c │ ├── install-codesaturne-gcc-mpi4.sh │ └── submit-F128-2-hpc7g-gcc-mpi4.sh ├── scripts-gromacs │ ├── compile-gromacs-acfl.sh │ └── sbatch-gromacs-acfl.sh ├── scripts-openfoam │ ├── compile-openfoam-acfl.sh │ ├── openfoam-v2112-patch.diff │ └── sbatch-openfoam-acfl.sh ├── scripts-setup │ ├── 0-install-acfl.sh │ ├── 1-install-armpl.sh │ ├── 2a-install-openmpi-with-acfl.sh │ ├── 2b-install-openmpi-with-gcc.sh │ ├── hpc7g-ubuntu2004-useast1.yaml │ ├── install-gcc-11.sh │ └── install-tools-headnode-ubuntu2004.sh ├── scripts-wrf │ ├── 0-install_zlib_1p2.sh │ ├── 1-install_hdf5_1p12.sh │ ├── 2-install_pnetcdf.sh │ ├── 3-install_netcdf_c.sh │ ├── 4-install_netcdf_fortran.sh │ ├── WRF-v45-patch-acfl.diff │ ├── compile-wrf-v45-acfl.sh │ ├── diffwrf.py │ ├── install-wrf-tools-acfl.sh │ ├── netcdf-c-success-message.txt │ ├── netcdf-fortran-success-message.txt │ ├── pnetcdf-success-message.txt │ ├── sbatch-wrf-v45-acfl.sh │ └── scripts-wps │ │ ├── 0-install_jasper.sh │ │ └── compile-wps.sh └── setup-an-ec2-hpc-instance.md ├── LICENSE ├── LICENSE-SAMPLECODE ├── LICENSE-SUMMARY ├── Monitoring_Tools_on_Graviton.md ├── R.md ├── README.md ├── SIMD_and_vectorization.md ├── amis_cf_sm.md ├── arm64-assembly-optimization.md ├── aws-lambda ├── GravitonLambdaNumber │ ├── src │ │ ├── Dockerfile │ │ ├── app.js │ │ └── package.json │ ├── template.yml │ └── test │ │ └── event.json ├── PythonPrime │ ├── samconfig.toml │ ├── src │ │ └── app.py │ └── template.yaml ├── README.md └── img │ ├── ApiBasePath.png │ ├── LambdaGraviton.png │ ├── createfunctionfromimage.png │ ├── curlarm64.png │ ├── curlx86.png │ ├── dockerbuild.png │ ├── dockerinspect.png │ ├── dockerrun.png │ ├── dockerrunresponse.png │ ├── powertuningcompare.png │ ├── powertuningcomparison.png │ ├── powertuningstatemachine.png │ ├── 
powertuningx86results.png │ ├── primefunctions.png │ ├── sambuild.png │ ├── sambuildcontainer.png │ ├── samdeploy-g.png │ └── samlocalinvoke.png ├── c-c++.md ├── containers.md ├── dotnet.md ├── dpdk_spdk.md ├── golang.md ├── howtoresources.md ├── isv.md ├── java.md ├── machinelearning ├── llama.cpp.md ├── onnx.md ├── pytorch.md ├── tensorflow.md └── vllm.md ├── managed_services.md ├── nodejs.md ├── optimizing.md ├── os.md ├── perfrunbook ├── README.md ├── appendix.md ├── configuring_your_loadgen.md ├── configuring_your_sut.md ├── debug_code_perf.md ├── debug_hw_perf.md ├── debug_system_perf.md ├── defining_your_benchmark.md ├── images │ ├── example_breaking_latency_chart.png │ ├── jmc_example_image.png │ ├── oncpu_example_flamgraph.png │ ├── performance_debug_flowchart.png │ └── system-load │ │ ├── c6i.png │ │ ├── c7g-compared-to-c6i.png │ │ └── c7g.png ├── intro_to_benchmarking.md ├── optimization_recommendation.md ├── references.md ├── system-load-and-compute-headroom.md └── utilities │ ├── capture_flamegraphs.sh │ ├── configure_graviton_metal_iommu.sh │ ├── configure_mem_size.sh │ ├── configure_vcpus.sh │ ├── find_and_list_jar_with_so.sh │ ├── find_and_list_pylib_with_so.sh │ ├── install_bcc_tools_al2.sh │ ├── install_perfrunbook_dependencies.sh │ ├── measure_aggregated_pmu_stats.py │ ├── measure_and_plot_basic_pmu_counters.py │ ├── measure_and_plot_basic_sysstat_stats.py │ ├── mpstat_parse.py │ └── sar_parse.py ├── php-opcache-al2.md ├── php.md ├── python.md ├── runtime-feature-detection.md ├── rust.md ├── sample-code ├── crc.c ├── hwcaps-test.c ├── hwcaps.c └── lambda_region_finder.sh ├── software ├── ChromeAndPuppeteer.md └── librdkafka.md └── transition-guide.md /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
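The branch-and-commit steps above (a focused change, staged selectively, with a clear commit message) can be sketched locally. This is a minimal offline sketch: the repository, branch, file, and commit names are placeholders, and the initial fork/clone and final `git push` are omitted so it runs without network access.

```shell
set -e
# Work in a throwaway directory so the sketch does not touch real repos.
workdir=$(mktemp -d)
cd "$workdir"
git init -q demo-fork && cd demo-fork
git config user.email "you@example.com"   # placeholder identity
git config user.name "Your Name"
git commit -q --allow-empty -m "initial commit"

git checkout -q -b fix-docs-typo          # one focused branch per change
echo "fixed wording" > README.md          # the actual edit
git add README.md                          # stage only the relevant change
git commit -q -m "docs: fix typo in README"
git log --oneline -n 1                     # clear, descriptive message
```

After this, pushing the branch to your fork and opening the pull request on GitHub completes steps 5 and 6.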
38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
62 | -------------------------------------------------------------------------------- /CommonNativeJarsTable.md: -------------------------------------------------------------------------------- 1 | ## Commonly used Jars that package native artifacts 2 | 3 | Org | jar | Builds on Arm | Arm Artifact available | Minimum Version 4 | -----|------|---------------|------------------------|--------- 5 | com.github.luben | [zstd-jni](https://github.com/luben/zstd-jni) | yes | [yes](https://mvnrepository.com/artifact/com.github.luben/zstd-jni) | 1.2.0 6 | org.lz4 | [lz4-java](https://github.com/lz4/lz4-java) | yes | [yes](https://mvnrepository.com/artifact/org.lz4/lz4-java) | 1.4.0 7 | org.xerial.snappy | [snappy-java](https://github.com/xerial/snappy-java) | yes | [yes](https://mvnrepository.com/artifact/org.xerial.snappy/snappy-java) | 1.1.4 8 | org.rocksdb | [rocksdbjni](https://github.com/facebook/rocksdb/tree/master/java) | yes | [yes](https://mvnrepository.com/artifact/org.rocksdb/rocksdbjni) | 5.0.1 (7.4.3+ recommended) 9 | com.github.jnr | [jffi](https://github.com/jnr/jffi) | yes | [yes](https://mvnrepository.com/artifact/com.github.jnr/jffi) | 1.2.13 10 | org.apache.commons | [commons-crypto](https://github.com/apache/commons-crypto) | yes | [yes](https://search.maven.org/artifact/org.apache.commons/commons-crypto/1.1.0/jar) | 1.1.0 11 | io.netty | [netty-transport-native-epoll](https://github.com/netty/netty) | yes | [yes](https://mvnrepository.com/artifact/io.netty/netty-transport-native-epoll) | 4.1.50 12 | io.netty | [netty-tcnative](https://github.com/netty/netty-tcnative) | yes | [yes](https://mvnrepository.com/artifact/io.netty/netty-tcnative) | 2.0.31 13 | org.fusesource.jansi | [jansi-native](https://github.com/fusesource/jansi-native) | yes | no | 14 | org.fusesource.leveldbjni | [leveldbjni-all](https://github.com/fusesource/leveldbjni) | no | no | 15 | org.fusesource.sigar | [sigar](https://github.com/hyperic/sigar) | yes (refer 
https://github.com/hyperic/sigar/pull/140) | [debian](https://pkgs.org/download/libhyperic-sigar-java) | 1.6.4 16 | org.apache.hadoop | [hadoop-lzo](https://github.com/twitter/hadoop-lzo) | yes | no | 17 | 18 | --- 19 | Updated on 2022-08-02 20 | -------------------------------------------------------------------------------- /DataAnalytics.md: -------------------------------------------------------------------------------- 1 | # Spark on Graviton 2 | 3 | Apache Spark is a data processing framework widely used to extract information from large pools of data. 4 | One of the main problems that affects performance is the straggler: a long-running task that slows down the entire cluster. Stragglers in Spark are usually caused by a non-uniform distribution of work or by data skewed across nodes, leaving a single task with more work than the rest. The goal is to keep all CPUs busy rather than have a small set of cores executing long-running tasks. The correct configuration depends on the dataset size, the number of instances, the core count per instance, and the computational complexity. 5 | 6 | Below are some general guidelines for users trying to improve overall application performance across Spark clusters. Since no single set of configuration values works well in all cases, we advise users to benchmark real applications after following these guidelines. 7 | 8 | 1. Shuffle Partitions: This configuration option is important for mitigating performance issues caused by stragglers. Recommendations: 9 | * Keep the partition size below 200 MB for optimal performance 10 | * Set the number of partitions to a multiple of the number of available cores (1x cores, 2x cores, etc.) 11 | 12 | Below are benchmark results showing total execution time as the shuffle-partitions value is varied. Benchmarking was performed on a Spark cluster with 128 vCPUs spread across 8 Graviton3 instances, executing queries against a 1 TB TPC-DS dataset. 
13 | We have seen an 80% improvement in performance when using an optimized value versus a non-optimized one. 14 | 15 | 16 | |shuffle_partitions |Execution time (mins) |%Diff | 17 | |--- |--- |--- | 18 | |10 |175 |Baseline | 19 | |16 |117 |-33% | 20 | |30 |72 |-59% | 21 | |64 |50 |-71% | 22 | |128 |48 |-73% | 23 | |256 |39 |-78% | 24 | |512 |37 |-79% | 25 | |1024 |35 |-80% | 26 | |2000 |35 |-80% | 27 | 28 | *Lower numbers are better, and a negative % diff means faster. Benchmarked on Spark 3.3.0 with Java 17 using the spark-sql-perf framework from Databricks* 29 | 30 | 31 | 32 | 2. When using Amazon EMR to set up a Spark cluster, we recommend starting with the EMR defaults for configuration options. For specific cases that need further tuning, refer to the general optimization guide at https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-performance.html. 33 | 3. Adaptive Query Execution (AQE) is an optimization technique in Spark SQL that uses runtime statistics to choose the most efficient query execution plan; it is enabled by default since Apache Spark 3.2.0. For users on older Spark versions, we recommend turning it on and checking whether it improves performance. (https://spark.apache.org/docs/latest/sql-performance-tuning.html#adaptive-query-execution) 34 | 4. We have seen a 40% improvement in performance when using Spark 3 with Java 17 compared to Spark 2 with Java 8, so we recommend using the latest Spark 3 with Java 17. 
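As a concrete sketch of the guidance above, the shuffle-partition and AQE settings can be passed at job submission. This is a hypothetical invocation: the application name is a placeholder, and 512 simply follows the "multiple of total cores" rule for the 128-vCPU benchmark cluster (4 x 128); size these values for your own data and cluster.

```shell
# Hypothetical job submission applying the tuning guidance above.
# spark.sql.shuffle.partitions: a multiple of the cluster's total cores,
# chosen so each shuffle partition stays under ~200 MB.
# The adaptive.* flags are standard Spark 3.x settings (on by default since 3.2.0).
spark-submit \
  --conf spark.sql.shuffle.partitions=512 \
  --conf spark.sql.adaptive.enabled=true \
  --conf spark.sql.adaptive.coalescePartitions.enabled=true \
  your_query_job.py   # placeholder application
```

The same properties can equally be set in `spark-defaults.conf` or on the `SparkSession` builder; benchmark your real workload to pick the final values.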
-------------------------------------------------------------------------------- /HPC/images/headNode-info-ec2console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/HPC/images/headNode-info-ec2console.png -------------------------------------------------------------------------------- /HPC/scripts-code_saturne/install-codesaturne-gcc-mpi4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /shared/tools 3 | 4 | module use /shared/arm/modulefiles 5 | module load armpl 6 | export PATH=/shared/openmpi-4.1.6/bin:$PATH 7 | export LD_LIBRARY_PATH=/shared/openmpi-4.1.6/lib:$LD_LIBRARY_PATH 8 | export CC=mpicc 9 | export CXX=mpicxx 10 | export FC=mpif90 11 | export F77=mpif90 12 | export F90=mpif90 13 | 14 | if [ ! -d code_saturne-8.0.2 ]; then 15 | wget https://www.code-saturne.org/releases/code_saturne-8.0.2.tar.gz 16 | tar xf code_saturne-8.0.2.tar.gz 17 | fi 18 | cd code_saturne-8.0.2 19 | 20 | PREFIX=/shared/code_saturne_8.0-mpi4 21 | mkdir build-mpi4 22 | cd build-mpi4 23 | 24 | ../configure CC=${CC} CXX=${CXX} FC=${FC} \ 25 | --with-blas=$ARMPL_LIBRARIES --prefix=$PREFIX \ 26 | --disable-gui --without-med \ 27 | --without-hdf5 --without-cgns \ 28 | --without-metis --disable-salome \ 29 | --without-salome --without-eos \ 30 | --disable-static --enable-long-gnum \ 31 | --enable-profile 32 | 33 | make -j 34 | make install 35 | -------------------------------------------------------------------------------- /HPC/scripts-code_saturne/submit-F128-2-hpc7g-gcc-mpi4.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --wait-all-nodes=1 3 | #SBATCH --ntasks-per-node=64 4 | #SBATCH --nodes=3 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --ntasks-per-core=1 7 | #SBATCH --export=ALL 8 | #SBATCH --partition=compute-hpc7g 9 | 
#SBATCH --exclusive 10 | 11 | module purge 12 | module use /shared/arm/modulefiles 13 | module load armpl/23.04.1_gcc-11.3 14 | export LD_LIBRARY_PATH="/shared/code_saturne_8.0-mpi4/lib":$LD_LIBRARY_PATH 15 | export PATH=/shared/openmpi-4.1.6/bin:$PATH 16 | export LD_LIBRARY_PATH="/shared/openmpi-4.1.6/lib":$LD_LIBRARY_PATH 17 | 18 | ulimit -s unlimited 19 | export OMP_NUM_THREADS=1 20 | 21 | export KMP_AFFINITY=compact,verbose 22 | export FI_EFA_FORK_SAFE=1 23 | echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 24 | 25 | export solver_bin=./cs_solver 26 | cd /shared/data-codesaturne/saturne-open-cases/BUNDLE/BENCH_F128_02/RESU/20240304-1655 27 | 28 | mpirun --report-bindings --bind-to core -n ${SLURM_NTASKS} $solver_bin "$@" 29 | export CS_RET=$? 30 | 31 | exit $CS_RET 32 | -------------------------------------------------------------------------------- /HPC/scripts-gromacs/compile-gromacs-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: ./compile-gromacs-acfl.sh (optional, default version is 2022.4) 3 | # the Gromacs releases can be found at https://manual.gromacs.org/ 4 | gromacs_version=2022.4 5 | if [ ! -z "$1" ] 6 | then 7 | gromacs_version=$1 8 | fi 9 | 10 | cd /shared/tools 11 | wget -q http://ftp.gromacs.org/pub/gromacs/gromacs-${gromacs_version}.tar.gz 12 | tar xf gromacs-${gromacs_version}.tar.gz 13 | mkdir -p gromacs-${gromacs_version}/build_mpi-acfl && cd gromacs-${gromacs_version}/build_mpi-acfl 14 | 15 | export ROOT=/shared 16 | CURDIR=${ROOT}/gromacs-${gromacs_version}-acfl 17 | export PATH=/shared/openmpi-4.1.4-acfl/bin:$PATH 18 | export LD_LIBRARY_PATH=/shared/openmpi-4.1.4-acfl/lib:$LD_LIBRARY_PATH 19 | module use /shared/arm/modulefiles 20 | module load acfl armpl 21 | 22 | export LDFLAGS="-lgfortran -lamath -lm -lastring" 23 | cmake .. 
-DGMX_BUILD_OWN_FFTW=OFF \ 24 | -DREGRESSIONTEST_DOWNLOAD=ON \ 25 | -DCMAKE_C_FLAGS="-mcpu=neoverse-512tvb --param=aarch64-autovec-preference=4 -g" \ 26 | -DCMAKE_CXX_FLAGS="-mcpu=neoverse-512tvb --param=aarch64-autovec-preference=4 -g" \ 27 | -DCMAKE_C_COMPILER=$(which mpicc) \ 28 | -DCMAKE_CXX_COMPILER=$(which mpicxx) \ 29 | -DGMX_OMP=ON \ 30 | -DGMX_MPI=ON \ 31 | -DGMX_SIMD=ARM_SVE \ 32 | -DGMX_BUILD_MDRUN_ONLY=OFF \ 33 | -DGMX_DOUBLE=OFF \ 34 | -DCMAKE_INSTALL_PREFIX=${CURDIR} \ 35 | -DBUILD_SHARED_LIBS=OFF \ 36 | -DGMX_FFT_LIBRARY=fftw3 \ 37 | -DFFTWF_LIBRARY=${ARMPL_LIBRARIES}/libarmpl_lp64.so \ 38 | -DFFTWF_INCLUDE_DIR=${ARMPL_INCLUDES} \ 39 | \ 40 | -DGMX_BLAS_USER=${ARMPL_LIBRARIES}/libarmpl_lp64.so \ 41 | -DGMX_LAPACK_USER=${ARMPL_LIBRARIES}/libarmpl_lp64.so \ 42 | \ 43 | -DGMXAPI=OFF \ 44 | -DGMX_GPU=OFF 45 | 46 | make 47 | make install 48 | cd .. 49 | -------------------------------------------------------------------------------- /HPC/scripts-gromacs/sbatch-gromacs-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --wait-all-nodes=1 3 | #SBATCH --ntasks-per-node=64 4 | #SBATCH --ntasks=64 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --ntasks-per-core=1 7 | #SBATCH --export=ALL 8 | #SBATCH --partition=compute 9 | #SBATCH --exclusive 10 | 11 | export PATH=/shared/gromacs-2022.4-acfl/bin:$PATH 12 | export LD_LIBRARY_PATH=/shared/gromacs-2022.4-acfl/lib:$LD_LIBRARY_PATH 13 | export PATH=/shared/openmpi-4.1.4-acfl/bin:$PATH 14 | export LD_LIBRARY_PATH=/shared/openmpi-4.1.4-acfl/lib:$LD_LIBRARY_PATH 15 | module use /shared/arm/modulefiles 16 | module load acfl armpl 17 | 18 | [ ! -d /shared/data-gromacs/benchRIB ] && mkdir -p /shared/data-gromacs/benchRIB 19 | cd /shared/data-gromacs/benchRIB 20 | 21 | bench=RIB 22 | if [ ! 
-f bench${bench} ]; then 23 | wget https://www.mpinat.mpg.de/bench${bench} 24 | unzip bench${bench} 25 | fi 26 | 27 | export OMP_NUM_THREADS=${SLURM_CPUS_PER_TASK} 28 | export FI_EFA_FORK_SAFE=1 29 | echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 30 | 31 | mpirun -np ${SLURM_NTASKS} --report-bindings gmx_mpi mdrun -v -maxh 0.25 -deffnm bench${bench} -ntomp ${OMP_NUM_THREADS} -resethway &>> bench${bench}.out 32 | -------------------------------------------------------------------------------- /HPC/scripts-openfoam/compile-openfoam-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: ./compile-openfoam-acfl.sh (optional, default version is v2112) 3 | # OpenFOAM releases can be found at https://www.openfoam.com/download/release-history 4 | 5 | openfoam_version=v2112 6 | if [ ! -z "$1" ] 7 | then 8 | openfoam_version=$1 9 | fi 10 | 11 | mkdir -p /shared/tools/openfoam-root && cd /shared/tools/openfoam-root 12 | export PATH=/shared/openmpi-4.1.4-acfl/bin:$PATH 13 | export LD_LIBRARY_PATH=/shared/openmpi-4.1.4-acfl/lib:$LD_LIBRARY_PATH 14 | module use /shared/arm/modulefiles 15 | module load acfl armpl 16 | 17 | [ -d openfoam ] || git clone -b OpenFOAM-${openfoam_version} https://develop.openfoam.com/Development/openfoam.git 18 | [ -d ThirdParty-common ] || git clone -b ${openfoam_version} https://develop.openfoam.com/Development/ThirdParty-common.git 19 | 20 | pushd ThirdParty-common 21 | scotch_version="6.1.0" 22 | git clone -b v${scotch_version} https://gitlab.inria.fr/scotch/scotch.git scotch_${scotch_version} 23 | popd 24 | cd openfoam 25 | 26 | # a patch required for ACfL or GCC-12 (https://develop.openfoam.com/Development/openfoam/-/commit/91198eaf6a0c11b57446374d97a079ca95cf1412) 27 | wget https://raw.githubusercontent.com/aws/aws-graviton-getting-started/main/HPC/scripts-openfoam/openfoam-v2112-patch.diff 28 | git apply openfoam-v2112-patch.diff 29 | 30 | sed -i -e 
"s/WM_COMPILER=Gcc/WM_COMPILER=Arm/g" etc/bashrc 31 | source etc/bashrc || echo "Non-zero exit of source etc/bashrc" 32 | ./Allwmake -j 33 | -------------------------------------------------------------------------------- /HPC/scripts-openfoam/openfoam-v2112-patch.diff: -------------------------------------------------------------------------------- 1 | diff --git a/etc/bashrc b/etc/bashrc 2 | index 1b24ab2b2b..2c5bcbc3cf 100644 3 | --- a/etc/bashrc 4 | +++ b/etc/bashrc 5 | @@ -69,7 +69,7 @@ export WM_COMPILER_TYPE=system 6 | # = Gcc | Clang | Icc | Icx | Amd | Arm | Cray | Fujitsu | 7 | # Gcc | Clang 8 | # [Not well tested: Pgi | Nvidia] 9 | -export WM_COMPILER=Gcc 10 | +export WM_COMPILER=Arm 11 | 12 | # [WM_PRECISION_OPTION] - Floating-point precision: 13 | # = DP | SP | SPDP 14 | diff --git a/src/OpenFOAM/db/IOstreams/hashes/OSHA1stream.H b/src/OpenFOAM/db/IOstreams/hashes/OSHA1stream.H 15 | index 763745aa25..28c84a64b3 100644 16 | --- a/src/OpenFOAM/db/IOstreams/hashes/OSHA1stream.H 17 | +++ b/src/OpenFOAM/db/IOstreams/hashes/OSHA1stream.H 18 | @@ -32,8 +32,8 @@ Description 19 | 20 | \*---------------------------------------------------------------------------*/ 21 | 22 | -#ifndef OSHA1stream_H 23 | -#define OSHA1stream_H 24 | +#ifndef Foam_OSHA1stream_H 25 | +#define Foam_OSHA1stream_H 26 | 27 | #include "OSstream.H" 28 | #include "SHA1.H" 29 | @@ -63,10 +63,17 @@ class osha1stream 30 | 31 | protected: 32 | 33 | + //- Handle overflow 34 | + virtual int overflow(int c = EOF) 35 | + { 36 | + if (c != EOF) sha1_.append(c); 37 | + return c; 38 | + } 39 | + 40 | //- Put sequence of characters 41 | virtual std::streamsize xsputn(const char* s, std::streamsize n) 42 | { 43 | - sha1_.append(s, n); 44 | + if (n) sha1_.append(s, n); 45 | return n; 46 | } 47 | 48 | diff --git a/src/OpenFOAM/primitives/hashes/SHA1/SHA1.H b/src/OpenFOAM/primitives/hashes/SHA1/SHA1.H 49 | index 9d9e617a48..f7dc764860 100644 50 | --- a/src/OpenFOAM/primitives/hashes/SHA1/SHA1.H 51 | +++ 
b/src/OpenFOAM/primitives/hashes/SHA1/SHA1.H 52 | @@ -42,8 +42,8 @@ SourceFiles 53 | 54 | \*---------------------------------------------------------------------------*/ 55 | 56 | -#ifndef SHA1_H 57 | -#define SHA1_H 58 | +#ifndef Foam_SHA1_H 59 | +#define Foam_SHA1_H 60 | 61 | #include 62 | #include 63 | @@ -113,6 +113,9 @@ public: 64 | //- Reset the hashed data before appending more 65 | void clear() noexcept; 66 | 67 | + //- Append single character 68 | + inline void append(char c); 69 | + 70 | //- Append data for processing 71 | inline SHA1& append(const char* str); 72 | 73 | diff --git a/src/OpenFOAM/primitives/hashes/SHA1/SHA1I.H b/src/OpenFOAM/primitives/hashes/SHA1/SHA1I.H 74 | index b04b3b6161..d5587a1cdc 100644 75 | --- a/src/OpenFOAM/primitives/hashes/SHA1/SHA1I.H 76 | +++ b/src/OpenFOAM/primitives/hashes/SHA1/SHA1I.H 77 | @@ -52,6 +52,12 @@ inline Foam::SHA1::SHA1(const std::string& str) 78 | 79 | // * * * * * * * * * * * * * * * Member Functions * * * * * * * * * * * * * // 80 | 81 | +inline void Foam::SHA1::append(char c) 82 | +{ 83 | + processBytes(&c, 1); 84 | +} 85 | + 86 | + 87 | inline Foam::SHA1& Foam::SHA1::append(const char* data, size_t len) 88 | { 89 | processBytes(data, len); 90 | -------------------------------------------------------------------------------- /HPC/scripts-openfoam/sbatch-openfoam-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --wait-all-nodes=1 3 | #SBATCH --ntasks-per-node=64 4 | #SBATCH --ntasks=64 5 | #SBATCH --cpus-per-task=1 6 | #SBATCH --ntasks-per-core=1 7 | #SBATCH --export=ALL 8 | #SBATCH --partition=compute 9 | #SBATCH --exclusive 10 | 11 | # load OpenFOAM environment settings 12 | export FI_EFA_FORK_SAFE=1 13 | export WM_PROJECT_DIR=/shared/tools/openfoam-root/openfoam 14 | source $WM_PROJECT_DIR/bin/tools/RunFunctions 15 | source $WM_PROJECT_DIR/etc/bashrc 16 | export PATH=/shared/openmpi-4.1.4-acfl/bin:$PATH 17 | export 
LD_LIBRARY_PATH=/shared/openmpi-4.1.4-acfl/lib:$LD_LIBRARY_PATH 18 | module use /shared/arm/modulefiles 19 | module load acfl armpl 20 | 21 | workdir=/shared/data-openfoam/motorBike-70M 22 | mkdir -p $workdir && cd $workdir 23 | cp -rp ${WM_PROJECT_DIR}/tutorials/incompressible/pisoFoam/LES/motorBike/motorBike/ . 24 | cp -rp ${WM_PROJECT_DIR}/tutorials/incompressible/pisoFoam/LES/motorBike/lesFiles/ . 25 | cd motorBike 26 | 27 | mkdir constant/triSurface 28 | cp -f \ 29 | "$FOAM_TUTORIALS"/resources/geometry/motorBike.obj.gz \ 30 | constant/triSurface/ 31 | mkdir log 32 | 33 | # use 64 processes for the simulation and domain decomposition is 4 subdomains in x, y and z direction 34 | sed -i 's/numberOfSubdomains 8;/numberOfSubdomains 64;/' system/decomposeParDict 35 | sed -i 's/(4 2 1);/(4 4 4);/' system/decomposeParDict 36 | blockMesh > ./log/blockMesh.log 37 | decomposePar -decomposeParDict system/decomposeParDict > log/decomposePar.log 38 | 39 | mkdir -p 0 40 | sed -i 's/mergeTolerance 1E-6/mergeTolerance 1E-5/' system/snappyHexMeshDict 41 | mpirun -np 64 snappyHexMesh -parallel -decomposeParDict system/decomposeParDict.ptscotch -profiling -overwrite > log/snappyHexMesh.log 42 | find . 
-iname '*level*' -type f -delete 43 | restore0Dir -processor 44 | cp 0.orig/* 0 -rp 45 | 46 | mpirun -np 64 renumberMesh -parallel -decomposeParDict system/decomposeParDict.ptscotch -overwrite -constant > ./log/renumberMesh.log 47 | mpirun -np 64 potentialFoam -parallel -decomposeParDict system/decomposeParDict.ptscotch -noFunctionObjects -initialiseUBCs > ./log/potentialFoam.log 48 | mpirun -np 64 simpleFoam -parallel -decomposeParDict system/decomposeParDict.ptscotch > ./log/simpleFoam.log 49 | 50 | -------------------------------------------------------------------------------- /HPC/scripts-setup/0-install-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download acfl for ubuntu 20.04 from arm website - https://developer.arm.com/downloads/-/arm-compiler-for-linux 4 | # please check the download link for the appropriate version 5 | # install acfl will include armpl automatically 6 | mkdir -p /shared/tools 7 | cd /shared/tools 8 | wget -O arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 'https://developer.arm.com/-/media/Files/downloads/hpc/arm-compiler-for-linux/23-04-1/arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar' 9 | tar xf arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 10 | ./arm-compiler-for-linux_23.04.1_Ubuntu-20.04/arm-compiler-for-linux_23.04.1_Ubuntu-20.04.sh \ 11 | -i /shared/arm -a --force 12 | -------------------------------------------------------------------------------- /HPC/scripts-setup/1-install-armpl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Find the download link to ArmPL (Ubuntu 20.04, GCC-12) on https://developer.arm.com/downloads/-/arm-performance-libraries 4 | # please check the download link for the appropriate version 5 | mkdir -p /shared/tools && cd /shared/tools 6 | wget -O arm-performance-libraries_23.04_Ubuntu-20.04_gcc-12.2.tar 
'https://developer.arm.com/-/media/Files/downloads/hpc/arm-performance-libraries/23-04/ubuntu-20/arm-performance-libraries_23.04_Ubuntu-20.04_gcc-12.2.tar' 7 | tar xf arm-performance-libraries_23.04_Ubuntu-20.04_gcc-12.2.tar 8 | cd arm-performance-libraries_23.04_Ubuntu-20.04/ 9 | ./arm-performance-libraries_23.04_Ubuntu-20.04.sh -i /shared/arm -a --force 10 | -------------------------------------------------------------------------------- /HPC/scripts-setup/2a-install-openmpi-with-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # compile a copy of Open MPI with ACFL 4 | export INSTALLDIR=/shared 5 | export OPENMPI_VERSION=4.1.4 6 | module use /shared/arm/modulefiles 7 | module load acfl 8 | export CC=armclang 9 | export CXX=armclang++ 10 | export FC=armflang 11 | export CFLAGS="-mcpu=neoverse-512tvb" 12 | 13 | OS_NAME=unknown 14 | grep -iq "Amazon Linux 2" /etc/os-release 2>/dev/null && OS_NAME=alinux2 15 | grep -iq "Ubuntu 20.04.6 LTS" /etc/os-release 2>/dev/null && OS_NAME=ubuntu 16 | 17 | if [ "$OS_NAME" = "alinux2" ] 18 | then 19 | EFA_LIB_DIR=/opt/amazon/efa/lib64 20 | elif [ "$OS_NAME" = "ubuntu" ] 21 | then 22 | EFA_LIB_DIR=/opt/amazon/efa/lib 23 | fi 24 | 25 | cd /shared/tools 26 | wget -N https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.gz 27 | tar -xzvf openmpi-4.1.4.tar.gz 28 | cd openmpi-4.1.4 29 | mkdir build-acfl 30 | cd build-acfl 31 | ../configure --prefix=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl --enable-mpirun-prefix-by-default --without-verbs --disable-man-pages --enable-builtin-atomics --with-libfabric=/opt/amazon/efa --with-libfabric-libdir=${EFA_LIB_DIR} 32 | make -j$(nproc) && make install 33 | 34 | -------------------------------------------------------------------------------- /HPC/scripts-setup/2b-install-openmpi-with-gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download and 
compile Open MPI with GCC for customers who want to use GCC 4 | export INSTALLDIR=/shared 5 | export OPENMPI_VERSION=4.1.4 6 | export CC=gcc 7 | export CXX=g++ 8 | export FC=gfortran 9 | export CFLAGS="-march=native" 10 | OS_NAME=unknown 11 | grep -iq "Amazon Linux 2" /etc/os-release 2>/dev/null && OS_NAME=alinux2 12 | grep -iq "Ubuntu 20.04.6 LTS" /etc/os-release 2>/dev/null && OS_NAME=ubuntu 13 | 14 | if [ "$OS_NAME" = "alinux2" ] 15 | then 16 | EFA_LIB_DIR=/opt/amazon/efa/lib64 17 | elif [ "$OS_NAME" = "ubuntu" ] 18 | then 19 | EFA_LIB_DIR=/opt/amazon/efa/lib 20 | fi 21 | 22 | cd /shared/tools 23 | wget -N https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.gz 24 | tar -xzvf openmpi-4.1.4.tar.gz 25 | cd openmpi-4.1.4 26 | mkdir build 27 | cd build 28 | ../configure --prefix=${INSTALLDIR}/openmpi-${OPENMPI_VERSION} --enable-mpirun-prefix-by-default --without-verbs --disable-man-pages --enable-builtin-atomics --with-libfabric=/opt/amazon/efa --with-libfabric-libdir=${EFA_LIB_DIR} 29 | make -j$(nproc) && make install 30 | 31 | -------------------------------------------------------------------------------- /HPC/scripts-setup/hpc7g-ubuntu2004-useast1.yaml: -------------------------------------------------------------------------------- 1 | Region: us-east-1 2 | Imds: 3 | ImdsSupport: v2.0 4 | Image: 5 | Os: ubuntu2004 6 | HeadNode: 7 | InstanceType: c7g.4xlarge 8 | Networking: 9 | SubnetId: subnet-xxxxxxxxxxxxxxxxx 10 | Ssh: 11 | KeyName: xxxxxxxxx 12 | Iam: 13 | S3Access: 14 | - EnableWriteAccess: true 15 | BucketName: '*' 16 | CustomActions: 17 | OnNodeConfigured: 18 | Script: s3:///install-gcc-11.sh 19 | Scheduling: 20 | Scheduler: slurm 21 | SlurmQueues: 22 | - Name: compute 23 | ComputeResources: 24 | - Name: hpc7g 25 | InstanceType: hpc7g.16xlarge 26 | MinCount: 0 27 | MaxCount: 64 28 | Efa: 29 | Enabled: true 30 | Networking: 31 | SubnetIds: 32 | - subnet-xxxxxxxxxxxxxxxxx 33 | PlacementGroup: 34 | Enabled: true 35 | Iam: 36 | 
AdditionalIamPolicies: 37 | - Policy: arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess 38 | CustomActions: 39 | OnNodeConfigured: 40 | Script: s3:///install-gcc-11.sh 41 | SharedStorage: 42 | - Name: FsxLustre0 43 | StorageType: FsxLustre 44 | MountDir: /shared 45 | FsxLustreSettings: 46 | StorageCapacity: 1200 47 | DeploymentType: PERSISTENT_2 48 | PerUnitStorageThroughput: 125 49 | DataCompressionType: LZ4 50 | 51 | -------------------------------------------------------------------------------- /HPC/scripts-setup/install-gcc-11.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | sudo apt update -y 5 | 6 | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 7 | sudo apt install -y gcc-11 g++-11 gfortran-11 8 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11 --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-11 9 | 10 | -------------------------------------------------------------------------------- /HPC/scripts-setup/install-tools-headnode-ubuntu2004.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # download acfl for ubuntu 20.04 from arm website - https://developer.arm.com/downloads/-/arm-compiler-for-linux 5 | # install acfl will include armpl automatically 6 | mkdir -p /shared/tools 7 | cd /shared/tools 8 | wget -O arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 'https://developer.arm.com/-/media/Files/downloads/hpc/arm-compiler-for-linux/23-04-1/arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar?rev=52971e8fa8a8498c834e48776dfd1ca5&revision=52971e8f-a8a8-498c-834e-48776dfd1ca5' 9 | tar xf arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 10 | ./arm-compiler-for-linux_23.04.1_Ubuntu-20.04/arm-compiler-for-linux_23.04.1_Ubuntu-20.04.sh \ 11 | -i /shared/arm -a --force 12 | 13 | # compile a copy 
of Open MPI with ACFL 14 | export INSTALLDIR=/shared 15 | export OPENMPI_VERSION=4.1.4 16 | module use /shared/arm/modulefiles 17 | module load acfl 18 | export CC=armclang 19 | export CXX=armclang++ 20 | export FC=armflang 21 | export CFLAGS="-mcpu=neoverse-512tvb" 22 | cd /shared/tools 23 | wget -N https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-4.1.4.tar.gz 24 | tar -xzvf openmpi-4.1.4.tar.gz 25 | cd openmpi-4.1.4 26 | mkdir build-acfl 27 | cd build-acfl 28 | ../configure --prefix=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl --enable-mpirun-prefix-by-default --with-sge --without-verbs --disable-man-pages --enable-builtin-atomics --with-libfabric=/opt/amazon/efa --with-libfabric-libdir=/opt/amazon/efa/lib 29 | make -j$(nproc) && make install 30 | 31 | # download and compile Open MPI with GCC for customers who want to use GCC 32 | export CC=gcc 33 | export CXX=g++ 34 | export FC=gfortran 35 | export CFLAGS="-march=native" 36 | cd /shared/tools 37 | cd openmpi-4.1.4 38 | mkdir build 39 | cd build 40 | ../configure --prefix=${INSTALLDIR}/openmpi-${OPENMPI_VERSION} --enable-mpirun-prefix-by-default --with-sge --without-verbs --disable-man-pages --enable-builtin-atomics --with-libfabric=/opt/amazon/efa --with-libfabric-libdir=/opt/amazon/efa/lib 41 | make -j$(nproc) && make install 42 | 43 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/0-install_zlib_1p2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRF_INSTALL=/shared 4 | module use /shared/arm/modulefiles 5 | module load acfl armpl 6 | export OPENMPI_VERSION=4.1.4 7 | export CC=armclang 8 | export CXX=armclang++ 9 | export FC=armflang 10 | mkdir -p /shared/tools-acfl && cd /shared/tools-acfl 11 | wget -N http://zlib.net/zlib-1.2.13.tar.gz 12 | tar -xzvf zlib-1.2.13.tar.gz 13 | cd zlib-1.2.13 14 | ./configure --prefix=${WRF_INSTALL}/zlib-acfl 15 | sed -i 's/DPIC/fPIC/g' Makefile 16 
| make check && make install 17 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/1-install_hdf5_1p12.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRF_INSTALL=/shared 4 | module use /shared/arm/modulefiles 5 | module load acfl armpl 6 | export OPENMPI_VERSION=4.1.4 7 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 8 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 9 | export CC=mpicc 10 | export CXX=mpic++ 11 | export FC=mpifort 12 | export F90=mpifort 13 | cd /shared/tools-acfl 14 | curl -o hdf5-1.12.0.tar.gz -J -L https://www.hdfgroup.org/package/hdf5-1-12-0-tar-gz/?wpdmdl=14582 15 | tar -xzvf hdf5-1.12.0.tar.gz 16 | cd hdf5-1.12.0 17 | ./configure --prefix=${WRF_INSTALL}/hdf5-acfl --with-zlib=${WRF_INSTALL}/zlib-acfl --enable-parallel --enable-shared --enable-hl --enable-fortran --with-pic 18 | sed -i -e 's#wl=""#wl="-Wl,"#g' libtool 19 | sed -i -e 's#pic_flag=""#pic_flag=" -fPIC -DPIC"#g' libtool 20 | make -j$(nproc) && make install -------------------------------------------------------------------------------- /HPC/scripts-wrf/2-install_pnetcdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRF_INSTALL=/shared 4 | module use /shared/arm/modulefiles 5 | module load acfl armpl 6 | export OPENMPI_VERSION=4.1.4 7 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 8 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 9 | export CC=armclang 10 | export CXX=armclang++ 11 | export FC=armflang 12 | export F77=armflang 13 | export F90=armflang 14 | export MPICC=mpicc 15 | export MPIF77=mpifort 16 | export MPIF90=mpifort 17 | export MPICXX=mpicxx 18 | export CFLAGS="-O3 -fPIC -DPIC" 19 | export CXXFLAGS="-O3 -fPIC -DPIC" 20 | export FFLAGS="-O3 -fPIC" 21 | export 
FCFLAGS="-O3 -fPIC" 22 | export FLDFLAGS="-fPIC" 23 | export F90LDFLAGS="-fPIC" 24 | export LDFLAGS="-fPIC" 25 | cd /shared/tools-acfl 26 | wget -N https://parallel-netcdf.github.io/Release/pnetcdf-1.12.2.tar.gz 27 | tar -xzvf pnetcdf-1.12.2.tar.gz 28 | cd pnetcdf-1.12.2 29 | ./configure --prefix=${WRF_INSTALL}/pnetcdf-acfl --enable-fortran --enable-large-file-test --enable-shared 30 | make -j$(nproc) && make install 31 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/3-install_netcdf_c.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRF_INSTALL=/shared 4 | module use /shared/arm/modulefiles 5 | module load acfl armpl 6 | export OPENMPI_VERSION=4.1.4 7 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 8 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 9 | export CC=mpicc 10 | export CXX=mpicxx 11 | export FC=mpif90 12 | export F77=mpif90 13 | export F90=mpif90 14 | HDF5=${WRF_INSTALL}/hdf5-acfl 15 | PNET=${WRF_INSTALL}/pnetcdf-acfl 16 | ZLIB=${WRF_INSTALL}/zlib-acfl 17 | export CPPFLAGS="-I$HDF5/include -I${PNET}/include" 18 | export CFLAGS="-I$HDF5/include -I${PNET}/include" 19 | export CXXFLAGS="-I$HDF5/include -I${PNET}/include" 20 | export FCFLAGS="-I$HDF5/include -I${PNET}/include" 21 | export FFLAGS="-I$HDF5/include -I${PNET}/include" 22 | export LDFLAGS="-I$HDF5/include -I${PNET}/include -L$ZLIB/lib -L$HDF5/lib -L${PNET}/lib" 23 | cd /shared/tools-acfl 24 | wget -N https://downloads.unidata.ucar.edu/netcdf-c/4.8.1/netcdf-c-4.8.1.tar.gz 25 | tar -xzvf netcdf-c-4.8.1.tar.gz 26 | cd netcdf-c-4.8.1 27 | ./configure --prefix=${WRF_INSTALL}/netcdf-acfl CPPFLAGS="-I$HDF5/include -I$PNET/include" CFLAGS="-DHAVE_STRDUP -O3 -march=armv8.2-a+crypto+fp16+rcpc+dotprod" LDFLAGS="-L$HDF5/lib -L$PNET/lib" --enable-pnetcdf --enable-large-file-tests --enable-largefile --enable-parallel-tests 
--enable-shared --enable-netcdf-4 --with-pic --disable-doxygen --disable-dap 28 | make -j$(nproc) && make install 29 | 30 | 31 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/4-install_netcdf_fortran.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export WRF_INSTALL=/shared 4 | module use /shared/arm/modulefiles 5 | module load acfl armpl 6 | export OPENMPI_VERSION=4.1.4 7 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 8 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 9 | export CC=mpicc 10 | export CXX=mpicxx 11 | export FC=mpif90 12 | export F77=mpif90 13 | export F90=mpif90 14 | HDF5=${WRF_INSTALL}/hdf5-acfl 15 | NCDIR=${WRF_INSTALL}/netcdf-acfl 16 | export LD_LIBRARY_PATH=${NCDIR}/lib:${LD_LIBRARY_PATH} 17 | export CPPFLAGS="-I$HDF5/include -I$NCDIR/include" 18 | export CFLAGS="-I$HDF5/include -I$NCDIR/include" 19 | export CXXFLAGS="-I$HDF5/include -I$NCDIR/include" 20 | export FCFLAGS="-I$HDF5/include -I$NCDIR/include" 21 | export FFLAGS="-I$HDF5/include -I$NCDIR/include" 22 | export LDFLAGS="-L$HDF5/lib -L$NCDIR/lib" 23 | cd /shared/tools-acfl 24 | wget -N https://downloads.unidata.ucar.edu/netcdf-fortran/4.5.4/netcdf-fortran-4.5.4.tar.gz 25 | tar -xzvf netcdf-fortran-4.5.4.tar.gz 26 | cd netcdf-fortran-4.5.4 27 | ./configure --prefix=$NCDIR --disable-static --enable-shared --with-pic --enable-parallel-tests --enable-large-file-tests --enable-largefile 28 | sed -i -e 's#wl=""#wl="-Wl,"#g' libtool 29 | sed -i -e 's#pic_flag=""#pic_flag=" -fPIC -DPIC"#g' libtool 30 | make -j$(nproc) && make install 31 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/WRF-v45-patch-acfl.diff: -------------------------------------------------------------------------------- 1 | diff --git a/arch/configure.defaults b/arch/configure.defaults 2 | index 
e7a98362..f3c97669 100644 3 | --- a/arch/configure.defaults 4 | +++ b/arch/configure.defaults 5 | @@ -2126,14 +2126,14 @@ NETCDFPAR_BUILD = CONFIGURE_NETCDFPAR_BUILD 6 | ########################################################### 7 | #ARCH Linux aarch64, GCC compiler OpenMPI # serial smpar dmpar dm+sm 8 | # 9 | -DESCRIPTION = GCC ($SFC/$SCC): Aarch64 10 | +DESCRIPTION = armclang ($SFC/$SCC): Aarch64 11 | DMPARALLEL = 12 | OMPCPP = -fopenmp 13 | OMP = -fopenmp 14 | OMPCC = -fopenmp 15 | -SFC = gfortran 16 | -SCC = gcc 17 | -CCOMP = gcc 18 | +SFC = armflang 19 | +SCC = armclang 20 | +CCOMP = armclang 21 | DM_FC = mpif90 22 | DM_CC = mpicc -DMPI2_SUPPORT 23 | FC = CONFIGURE_FC 24 | @@ -2141,17 +2141,15 @@ CC = CONFIGURE_CC 25 | LD = $(FC) 26 | RWORDSIZE = CONFIGURE_RWORDSIZE 27 | PROMOTION = 28 | -ARCH_LOCAL = -DAARCH64_X86_CORRECTNESS_FIX 29 | +ARCH_LOCAL = 30 | CFLAGS_LOCAL = -w -O3 -c 31 | LDFLAGS_LOCAL = -fopenmp 32 | -FCOPTIM = -Ofast -mcpu=native -funroll-loops -fno-expensive-optimizations -fno-reciprocal-math -fsigned-zeros -fno-unsafe-math-optimizations 33 | -# for graviton 2 use the folowing flag 34 | -#FCOPTIM = -Ofast -march=armv8.2-a+fp16+rcpc+dotprod -funroll-loops -fno-expensive-optimizations -fno-reciprocal-math -fsigned-zeros -fno-unsafe-math-optimizations 35 | +FCOPTIM = -Ofast -mcpu=native -funroll-loops 36 | FCREDUCEDOPT = $(FCOPTIM) 37 | FCNOOPT = -O0 -fopenmp -frecursive 38 | FCDEBUG = -g $(FCNOOPT) 39 | -FORMAT_FIXED = -ffixed-form -ffixed-line-length-0 -fallow-argument-mismatch -fallow-invalid-boz 40 | -FORMAT_FREE = -ffree-form -ffree-line-length-0 -fallow-argument-mismatch -fallow-invalid-boz 41 | +FORMAT_FIXED = -ffixed-form -ffixed-line-length-0 42 | +FORMAT_FREE = -ffree-form -ffree-line-length-0 43 | FCSUFFIX = 44 | BYTESWAPIO = -fconvert=big-endian -frecord-marker=4 45 | FCBASEOPTS = -w $(FORMAT_FREE) $(BYTESWAPIO) 46 | @@ -2165,6 +2163,7 @@ RANLIB = ranlib 47 | RLFLAGS = 48 | CC_TOOLS = $(SCC) 49 | 50 | + 51 | 
########################################################### 52 | #ARCH Linux aarch64, NVHPC compiler with nvc # serial smpar dmpar dm+sm 53 | # 54 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/compile-wrf-v45-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WRF releases can be found at https://github.com/wrf-model/WRF/releases 3 | # this script will install v4.5 from source (github) 4 | 5 | export WRF_INSTALL=/shared 6 | export CURDIR=/shared/wrf-arm-v45-acfl 7 | module use /shared/arm/modulefiles 8 | module load acfl armpl 9 | export OPENMPI_VERSION=4.1.4 10 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 11 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 12 | export CC=mpicc 13 | export CXX=mpicxx 14 | export FC=mpif90 15 | export F77=mpif90 16 | export F90=mpif90 17 | export ZLIB=${WRF_INSTALL}/zlib-acfl 18 | export HDF5=${WRF_INSTALL}/hdf5-acfl 19 | export PHDF5=${WRF_INSTALL}/hdf5-acfl 20 | export NETCDF=${WRF_INSTALL}/netcdf-acfl 21 | export PNETCDF=${WRF_INSTALL}/pnetcdf-acfl 22 | export PATH=${NETCDF}/bin:${PATH} 23 | export PATH=${PNETCDF}/bin:${PATH} 24 | export PATH=${HDF5}/bin:${PATH} 25 | export LD_LIBRARY_PATH=${ZLIB}/lib:$LD_LIBRARY_PATH 26 | export LD_LIBRARY_PATH=${NETCDF}/lib:$LD_LIBRARY_PATH 27 | export LD_LIBRARY_PATH=${PNETCDF}/lib:$LD_LIBRARY_PATH 28 | export LD_LIBRARY_PATH=${HDF5}/lib:$LD_LIBRARY_PATH 29 | export WRFIO_NCD_LARGE_FILE_SUPPORT=1 30 | export NETCDF_classic=1 31 | 32 | mkdir -p ${CURDIR} && cd ${CURDIR} 33 | # get WRF source v45 34 | git clone https://github.com/wrf-model/WRF.git 35 | cd WRF && git checkout release-v4.5 36 | 37 | # apply a patch for ACFL compiler options 38 | wget https://raw.githubusercontent.com/aws/aws-graviton-getting-started/main/HPC/scripts-wrf/WRF-v45-patch-acfl.diff 39 | git apply WRF-v45-patch-acfl.diff 40 | 41 
| # choose option '12. (dm+sm) armclang (armflang/armclang): Aarch64' and '1=basic' 42 | ./configure 43 | sed -i 's/(WRF_NMM_CORE)$/(WRF_NMM_CORE) -Wno-error=implicit-function-declaration -Wno-error=implicit-int/g' configure.wrf 44 | ./compile -j 1 em_real 2>&1 | tee compile_wrf.out 45 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/diffwrf.py: -------------------------------------------------------------------------------- 1 | from netCDF4 import Dataset 2 | import sys 3 | import numpy as np 4 | from scipy.stats import skew 5 | from scipy.stats import norm 6 | import os.path 7 | 8 | import matplotlib.mlab as mlab 9 | import matplotlib.pyplot as plt 10 | 11 | import pylab 12 | 13 | 14 | if len(sys.argv) != 3: 15 | print (" ") 16 | print ('Usage: ' + sys.argv[0] + ' file1 file2') 17 | print (" ") 18 | sys.exit(1) 19 | 20 | if os.path.exists(sys.argv[1]): 21 | print ('Found ' + sys.argv[1]) 22 | else: 23 | print (" ") 24 | print ('File does not exist: ' + sys.argv[1]) 25 | print (" ") 26 | sys.exit(2) 27 | 28 | if os.path.exists(sys.argv[2]): 29 | print ('Found ' + sys.argv[2]) 30 | else: 31 | print (" ") 32 | print ('File does not exist: ' + sys.argv[2]) 33 | print (" ") 34 | sys.exit(3) 35 | 36 | o = Dataset(sys.argv[1]) 37 | p = Dataset(sys.argv[2]) 38 | 39 | importantVars = [ 'U', 'V', 'W', 'T', 'PH', 'QVAPOR', 'TSLB', 'MU', 'TSK', 'RAINC', 'RAINNC' ] 40 | 41 | print (" ") 42 | print ("Differences of the output from two WRF model simulations for a few important fields") 43 | print ("Variable Name Minimum Maximum Average Std Dev Skew") 44 | print ("=========================================================================================") 45 | for v in importantVars: 46 | diffs = p.variables[v][:] - o.variables[v][:] 47 | print ('{0:24s} {1:12g} {2:12g} {3:12g} {4:12g} {5:12g}'.format(v, np.min(diffs[:]),np.max(diffs[:]),np.mean(diffs[:]),np.std(diffs[:]),skew(diffs[:],axis=None))) 48 | 49 | x = np.reshape(diffs,-1) 50 | n, 
bins, patches = plt.hist(x, 200, density=True, facecolor='green', alpha=0.75) 51 | 52 | # add a 'best fit' line 53 | y = norm.pdf( bins, np.mean(diffs[:]), np.std(diffs[:])) 54 | l = plt.plot(bins, y, 'r--', linewidth=1) 55 | 56 | plt.xlabel('Difference of ' + o.variables[v].name + ' (' + o.variables[v].units + '), points = ' + str(len(x))) 57 | plt.ylabel('Probability (%)') 58 | plt.title(r'$\mathrm{Histogram:}\ \mu=$'+str(np.mean(diffs[:]))+'$,\ \sigma=$'+str(np.std(diffs[:]))) 59 | 60 | plt.axis([np.min(diffs[:]), np.max(diffs[:]), 0, 50]) 61 | # plt.axis([np.mean(diffs[:])-8*np.std(diffs[:]), np.mean(diffs[:])+8*np.std(diffs[:]), 0, 50]) 62 | plt.grid(True) 63 | 64 | #plt.show() 65 | 66 | pylab.savefig( o.variables[v].name + '.png', bbox_inches='tight') 67 | pylab.close() 68 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/install-wrf-tools-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | export WRF_INSTALL=/shared 5 | module use /shared/arm/modulefiles 6 | module load acfl armpl 7 | export OPENMPI_VERSION=4.1.4 8 | export PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 9 | export LD_LIBRARY_PATH=${WRF_INSTALL}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 10 | export CC=armclang 11 | export CXX=armclang++ 12 | export FC=armflang 13 | export F77=armflang 14 | export F90=armflang 15 | export MPICC=mpicc 16 | export MPIF77=mpifort 17 | export MPIF90=mpifort 18 | export MPICXX=mpicxx 19 | 20 | # zlib 21 | mkdir -p /shared/tools-acfl && cd /shared/tools-acfl 22 | wget -N http://zlib.net/zlib-1.2.13.tar.gz 23 | tar -xzvf zlib-1.2.13.tar.gz 24 | cd zlib-1.2.13 25 | ./configure --prefix=${WRF_INSTALL}/zlib-acfl 26 | sed -i 's/DPIC/fPIC/g' Makefile 27 | make check && make install 28 | 29 | # hdf5 30 | export CC=mpicc 31 | export CXX=mpic++ 32 | export FC=mpifort 33 | export F90=mpifort 34 | cd /shared/tools-acfl 35 | curl 
-o hdf5-1.12.0.tar.gz -J -L https://www.hdfgroup.org/package/hdf5-1-12-0-tar-gz/?wpdmdl=14582 36 | tar -xzvf hdf5-1.12.0.tar.gz 37 | cd hdf5-1.12.0 38 | ./configure --prefix=${WRF_INSTALL}/hdf5-acfl --with-zlib=${WRF_INSTALL}/zlib-acfl --enable-parallel --enable-shared --enable-hl --enable-fortran --with-pic 39 | sed -i -e 's#wl=""#wl="-Wl,"#g' libtool 40 | sed -i -e 's#pic_flag=""#pic_flag=" -fPIC -DPIC"#g' libtool 41 | make -j$(nproc) && make install 42 | 43 | # pnetcdf 44 | export CC=armclang 45 | export CXX=armclang++ 46 | export FC=armflang 47 | export F77=armflang 48 | export F90=armflang 49 | export MPICC=mpicc 50 | export MPIF77=mpifort 51 | export MPIF90=mpifort 52 | export MPICXX=mpicxx 53 | export CFLAGS="-O3 -fPIC -DPIC" 54 | export CXXFLAGS="-O3 -fPIC -DPIC" 55 | export FFLAGS="-O3 -fPIC" 56 | export FCFLAGS="-O3 -fPIC" 57 | export FLDFLAGS="-fPIC" 58 | export F90LDFLAGS="-fPIC" 59 | export LDFLAGS="-fPIC" 60 | cd /shared/tools-acfl 61 | wget -N https://parallel-netcdf.github.io/Release/pnetcdf-1.12.2.tar.gz 62 | tar -xzvf pnetcdf-1.12.2.tar.gz 63 | cd pnetcdf-1.12.2 64 | ./configure --prefix=${WRF_INSTALL}/pnetcdf-acfl --enable-fortran --enable-large-file-test --enable-shared 65 | make -j$(nproc) && make install 66 | 67 | # netcdf-c 68 | export CC=mpicc 69 | export CXX=mpicxx 70 | export FC=mpif90 71 | export F77=mpif90 72 | export F90=mpif90 73 | HDF5=${WRF_INSTALL}/hdf5-acfl 74 | PNET=${WRF_INSTALL}/pnetcdf-acfl 75 | ZLIB=${WRF_INSTALL}/zlib-acfl 76 | export CPPFLAGS="-I$HDF5/include -I${PNET}/include" 77 | export CFLAGS="-I$HDF5/include -I${PNET}/include" 78 | export CXXFLAGS="-I$HDF5/include -I${PNET}/include" 79 | export FCFLAGS="-I$HDF5/include -I${PNET}/include" 80 | export FFLAGS="-I$HDF5/include -I${PNET}/include" 81 | export LDFLAGS="-I$HDF5/include -I${PNET}/include -L$ZLIB/lib -L$HDF5/lib -L${PNET}/lib" 82 | cd /shared/tools-acfl 83 | wget -N https://downloads.unidata.ucar.edu/netcdf-c/4.8.1/netcdf-c-4.8.1.tar.gz 84 | tar -xzvf 
netcdf-c-4.8.1.tar.gz 85 | cd netcdf-c-4.8.1 86 | ./configure --prefix=${WRF_INSTALL}/netcdf-acfl CPPFLAGS="-I$HDF5/include -I$PNET/include" CFLAGS="-DHAVE_STRDUP -O3 -march=armv8.2-a+crypto+fp16+rcpc+dotprod" LDFLAGS="-L$HDF5/lib -L$PNET/lib" --enable-pnetcdf --enable-large-file-tests --enable-largefile --enable-parallel-tests --enable-shared --enable-netcdf-4 --with-pic --disable-doxygen --disable-dap 87 | make -j$(nproc) && make install 88 | 89 | # netcdf-fortran 90 | NCDIR=${WRF_INSTALL}/netcdf-acfl 91 | export LD_LIBRARY_PATH=${NCDIR}/lib:${LD_LIBRARY_PATH} 92 | export CPPFLAGS="-I$HDF5/include -I$NCDIR/include" 93 | export CFLAGS="-I$HDF5/include -I$NCDIR/include" 94 | export CXXFLAGS="-I$HDF5/include -I$NCDIR/include" 95 | export FCFLAGS="-I$HDF5/include -I$NCDIR/include" 96 | export FFLAGS="-I$HDF5/include -I$NCDIR/include" 97 | export LDFLAGS="-L$HDF5/lib -L$NCDIR/lib" 98 | cd /shared/tools-acfl 99 | wget -N https://downloads.unidata.ucar.edu/netcdf-fortran/4.5.4/netcdf-fortran-4.5.4.tar.gz 100 | tar -xzvf netcdf-fortran-4.5.4.tar.gz 101 | cd netcdf-fortran-4.5.4 102 | ./configure --prefix=$NCDIR --disable-static --enable-shared --with-pic --enable-parallel-tests --enable-large-file-tests --enable-largefile 103 | sed -i -e 's#wl=""#wl="-Wl,"#g' libtool 104 | sed -i -e 's#pic_flag=""#pic_flag=" -fPIC -DPIC"#g' libtool 105 | make -j$(nproc) && make install 106 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/netcdf-c-success-message.txt: -------------------------------------------------------------------------------- 1 | +-------------------------------------------------------------+ 2 | | Congratulations! You have successfully installed netCDF! | 3 | | | 4 | | You can use script "nc-config" to find out the relevant | 5 | | compiler options to build your application. Enter | 6 | | | 7 | | nc-config --help | 8 | | | 9 | | for additional information. 
| 10 | | | 11 | | CAUTION: | 12 | | | 13 | | If you have not already run "make check", then we strongly | 14 | | recommend you do so. It does not take very long. | 15 | | | 16 | | Before using netCDF to store important data, test your | 17 | | build with "make check". | 18 | | | 19 | | NetCDF is tested nightly on many platforms at Unidata | 20 | | but your platform is probably different in some ways. | 21 | | | 22 | | If any tests fail, please see the netCDF web site: | 23 | | http://www.unidata.ucar.edu/software/netcdf/ | 24 | | | 25 | | NetCDF is developed and maintained at the Unidata Program | 26 | | Center. Unidata provides a broad array of data and software | 27 | | tools for use in geoscience education and research. | 28 | | http://www.unidata.ucar.edu | 29 | +-------------------------------------------------------------+ -------------------------------------------------------------------------------- /HPC/scripts-wrf/netcdf-fortran-success-message.txt: -------------------------------------------------------------------------------- 1 | +-------------------------------------------------------------+ 2 | | Congratulations! You have successfully installed the netCDF | 3 | | Fortran libraries. | 4 | | | 5 | | You can use script "nf-config" to find out the relevant | 6 | | compiler options to build your application. Enter | 7 | | | 8 | | nf-config --help | 9 | | | 10 | | for additional information. | 11 | | | 12 | | CAUTION: | 13 | | | 14 | | If you have not already run "make check", then we strongly | 15 | | recommend you do so. It does not take very long. | 16 | | | 17 | | Before using netCDF to store important data, test your | 18 | | build with "make check". | 19 | | | 20 | | NetCDF is tested nightly on many platforms at Unidata | 21 | | but your platform is probably different in some ways. 
| 22 | | | 23 | | If any tests fail, please see the netCDF web site: | 24 | | https://www.unidata.ucar.edu/software/netcdf/ | 25 | | | 26 | | NetCDF is developed and maintained at the Unidata Program | 27 | | Center. Unidata provides a broad array of data and software | 28 | | tools for use in geoscience education and research. | 29 | | https://www.unidata.ucar.edu | 30 | +-------------------------------------------------------------+ -------------------------------------------------------------------------------- /HPC/scripts-wrf/pnetcdf-success-message.txt: -------------------------------------------------------------------------------- 1 | +----------------------------------------------------------------------------+ 2 | | 3 | | PnetCDF has been successfully installed under 4 | | /shared/pnetcdf-acfl 5 | | 6 | | * PnetCDF header files have been installed in 7 | | /shared/pnetcdf-acfl/include 8 | | * PnetCDF library files have been installed in 9 | | /shared/pnetcdf-acfl/lib 10 | | * PnetCDF utility programs have been installed in 11 | | /shared/pnetcdf-acfl/bin 12 | | * PnetCDF man pages have been installed in 13 | | /shared/pnetcdf-acfl/share/man 14 | | 15 | | To compile your PnetCDF programs, please add the following to the command 16 | | line, so the compiler can find the PnetCDF header files: 17 | | -I/shared/pnetcdf-acfl/include 18 | | 19 | | Add the following line to link your program to PnetCDF library: 20 | | -L/shared/pnetcdf-acfl/lib -lpnetcdf 21 | | 22 | | Add the following to your run-time environment variable LD_LIBRARY_PATH, 23 | | when linking your executable with the PnetCDF shared libraries. 24 | | /shared/pnetcdf-acfl/lib 25 | | 26 | | 27 | | PnetCDF is jointly developed by a team at Northwestern University and 28 | | Argonne National Laboratory. 
29 | | 30 | | Visit PnetCDF project web site for more information 31 | | https://parallel-netcdf.github.io 32 | | 33 | +----------------------------------------------------------------------------+ -------------------------------------------------------------------------------- /HPC/scripts-wrf/sbatch-wrf-v45-acfl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --wait-all-nodes=1 3 | #SBATCH --ntasks-per-node=8 4 | #SBATCH --cpus-per-task=1 5 | #SBATCH --nodes=1 6 | #SBATCH --ntasks-per-core=1 7 | #SBATCH --export=ALL 8 | #SBATCH --partition=compute 9 | #SBATCH --exclusive 10 | 11 | #ENV VARIABLES# 12 | 13 | #---------------------Run-time env----------------------------------------- 14 | ulimit -s unlimited 15 | 16 | export OMP_STACKSIZE=12G 17 | export OMP_NUM_THREADS=8 18 | export FI_EFA_FORK_SAFE=1 19 | wrf_root=/shared 20 | wrf_install=${wrf_root} 21 | module use /shared/arm/modulefiles 22 | module load acfl armpl 23 | 24 | export PATH=${wrf_install}/openmpi-4.1.4-acfl/bin:$PATH 25 | export LD_LIBRARY_PATH=${wrf_install}/openmpi-4.1.4-acfl/lib:$LD_LIBRARY_PATH 26 | 27 | export LD_LIBRARY_PATH=${wrf_install}/netcdf-acfl/lib:$LD_LIBRARY_PATH 28 | export LD_LIBRARY_PATH=${wrf_install}/pnetcdf-acfl/lib:$LD_LIBRARY_PATH 29 | export LD_LIBRARY_PATH=${wrf_install}/hdf5-acfl/lib:$LD_LIBRARY_PATH 30 | 31 | #-------------------------------------------------------------------------- 32 | mkdir -p /shared/data-wrf && cd /shared/data-wrf 33 | wget https://www2.mmm.ucar.edu/wrf/src/conus12km.tar.gz 34 | tar xf conus12km.tar.gz 35 | cd conus12km 36 | cp ${wrf_install}/wrf-arm-v45-acfl/WRF/run/*.TBL . 37 | cp ${wrf_install}/wrf-arm-v45-acfl/WRF/run/*.formatted . 38 | cp ${wrf_install}/wrf-arm-v45-acfl/WRF/run/RRTMG* . 39 | cp ${wrf_install}/wrf-arm-v45-acfl/WRF/run/CAMtr_volume_mixing_ratio* . 
40 | ln -s ${wrf_install}/wrf-arm-v45-acfl/WRF/main/wrf.exe wrf-v45-acfl.exe 41 | 42 | echo "Running WRF on $(date)" 43 | echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 44 | 45 | date -u +%Y-%m-%d_%H:%M:%S >> wrf.times 46 | mpirun --map-by socket:PE=8 --bind-to core ./wrf-v45-acfl.exe &>> wrf.out 47 | echo ntasks=$SLURM_NTASKS 48 | date -u +%Y-%m-%d_%H:%M:%S >> wrf.times 49 | -------------------------------------------------------------------------------- /HPC/scripts-wrf/scripts-wps/0-install_jasper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export INSTALLDIR=/shared 4 | export OPENMPI_VERSION=4.1.4 5 | export PATH=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 6 | export LD_LIBRARY_PATH=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 7 | export CC=mpicc 8 | export CXX=mpicxx 9 | export FC=mpifort 10 | export F77=mpifort 11 | export F90=mpifort 12 | export CFLAGS="-g -O2 -fPIC -Wno-error=implicit-function-declaration -Wno-error=implicit-int -Wno-error=incompatible-function-pointer-types" 13 | export CXXFLAGS="-g -O2 -fPIC -Wno-error=implicit-function-declaration -Wno-error=implicit-int -Wno-error=incompatible-function-pointer-types" 14 | export FFLAGS="-g -fPIC" 15 | export FCFLAGS="-g -fPIC" 16 | export FLDFLAGS="-fPIC" 17 | export F90LDFLAGS="-fPIC" 18 | export LDFLAGS="-fPIC" 19 | 20 | module use ${INSTALLDIR}/arm/modulefiles 21 | module load acfl armpl 22 | 23 | cd ${INSTALLDIR}/tools 24 | wget https://www2.mmm.ucar.edu/wrf/OnLineTutorial/compile_tutorial/tar_files/jasper-1.900.1.tar.gz 25 | tar xf jasper-1.900.1.tar.gz 26 | cd jasper-1.900.1 27 | wget -N -O acaux/config.guess "http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD" 28 | 29 | ./configure --prefix=${INSTALLDIR}/jasper 30 | 31 | make -j$(nproc) && make install | tee jasper_out.log 32 | -------------------------------------------------------------------------------- 
/HPC/scripts-wrf/scripts-wps/compile-wps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export INSTALLDIR=/shared 4 | export CURDIR=${INSTALLDIR}/wps-arm-v45-acfl 5 | export OPENMPI_VERSION=4.1.4 6 | export PATH=$PATH 7 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH 8 | export PATH=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl/bin:$PATH 9 | export LD_LIBRARY_PATH=${INSTALLDIR}/openmpi-${OPENMPI_VERSION}-acfl/lib:$LD_LIBRARY_PATH 10 | export CC=mpicc 11 | export CXX=mpicxx 12 | export FC=mpif90 13 | export F77=mpif90 14 | export F90=mpif90 15 | export HDF5=${INSTALLDIR}/hdf5-acfl 16 | export PHDF5=${INSTALLDIR}/hdf5-acfl 17 | export NETCDF=${INSTALLDIR}/netcdf-acfl 18 | export PNETCDF=${INSTALLDIR}/pnetcdf-acfl 19 | export PATH=${INSTALLDIR}/netcdf-acfl/bin:${PATH} 20 | export PATH=${INSTALLDIR}/pnetcdf-acfl/bin:${PATH} 21 | export PATH=${INSTALLDIR}/hdf5-acfl/bin:${PATH} 22 | export PATH=$PATH:${INSTALLDIR}/jasper/bin 23 | export LD_LIBRARY_PATH=${INSTALLDIR}/netcdf-acfl/lib:$LD_LIBRARY_PATH 24 | export LD_LIBRARY_PATH=${INSTALLDIR}/pnetcdf-acfl/lib:$LD_LIBRARY_PATH 25 | export LD_LIBRARY_PATH=${INSTALLDIR}/hdf5-acfl/lib:$LD_LIBRARY_PATH 26 | export WRFIO_NCD_LARGE_FILE_SUPPORT=1 27 | export NETCDF_classic=1 28 | export JASPERLIB=${INSTALLDIR}/jasper/lib 29 | export JASPERINC=${INSTALLDIR}/jasper/include 30 | export WRF_DIR=${INSTALLDIR}/wrf-arm-v45-acfl/WRF 31 | 32 | module use ${INSTALLDIR}/arm/modulefiles 33 | module load acfl armpl 34 | 35 | mkdir -p ${CURDIR} && cd ${CURDIR} 36 | wget https://github.com/wrf-model/WPS/archive/refs/tags/v4.5.tar.gz 37 | tar xf v4.5.tar.gz 38 | cd WPS-4.5 39 | 40 | cat >> arch/configure.defaults << EOL 41 | ######################################################################################################################## 42 | #ARCH Linux aarch64, Arm compiler OpenMPI # serial smpar dmpar dm+sm 43 | # 44 | COMPRESSION_LIBS = CONFIGURE_COMP_L 45 | COMPRESSION_INC = 
CONFIGURE_COMP_I 46 | FDEFS = CONFIGURE_FDEFS 47 | SFC = armflang 48 | SCC = armclang 49 | DM_FC = mpif90 50 | DM_CC = mpicc -DMPI2_SUPPORT 51 | FC = CONFIGURE_FC 52 | CC = CONFIGURE_CC 53 | LD = $(FC) 54 | FFLAGS = -ffree-form -O -fconvert=big-endian -frecord-marker=4 -ffixed-line-length-0 -Wno-error=implicit-function-declaration -Wno-error=implicit-int -Wno-error=incompatible-function-pointer-types 55 | F77FLAGS = -ffixed-form -O -fconvert=big-endian -frecord-marker=4 -ffree-line-length-0 -Wno-error=implicit-function-declaration -Wno-error=implicit-int -Wno-error=incompatible-function-pointer-types 56 | FCSUFFIX = 57 | FNGFLAGS = $(FFLAGS) 58 | LDFLAGS = 59 | CFLAGS = -Wno-error=implicit-function-declaration -Wno-error=implicit-int -Wno-error=incompatible-function-pointer-types 60 | CPP = /usr/bin/cpp -P -traditional 61 | CPPFLAGS = -D_UNDERSCORE -DBYTESWAP -DLINUX -DIO_NETCDF -DBIT32 -DNO_SIGNAL CONFIGURE_MPI 62 | RANLIB = ranlib 63 | EOL 64 | 65 | ./configure <<< 2 66 | sed -i 's/-lnetcdf/-lnetcdf -lnetcdff -lgomp /g' configure.wps 67 | 68 | ./compile | tee compile_wps.log 69 | -------------------------------------------------------------------------------- /HPC/setup-an-ec2-hpc-instance.md: -------------------------------------------------------------------------------- 1 | ## Compile instructions on an EC2 instance 2 | If you are a developer who wants to build and test applications on Graviton, you can get started by launching an Ubuntu 20.04 C7g instance (4xlarge or larger) from the console. 
Follow the procedures below to set up the tools: 3 | ``` 4 | # get the build tools and upgrade GCC 5 | sudo apt update -y 6 | sudo apt install build-essential environment-modules cmake m4 zlib1g zlib1g-dev csh unzip flex -y 7 | 8 | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 9 | sudo apt install -y gcc-11 g++-11 gfortran-11 10 | sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 --slave /usr/bin/g++ g++ /usr/bin/g++-11 --slave /usr/bin/gcov gcov /usr/bin/gcov-11 --slave /usr/bin/gfortran gfortran /usr/bin/gfortran-11 11 | ``` 12 | Run `gcc --version` to confirm that gcc 11.4.0 is installed. 13 | 14 | ``` 15 | # install Arm Compiler for Linux, Arm Performance Libraries under /shared 16 | sudo mkdir -p /shared/tools/ 17 | sudo chown -R ubuntu: /shared 18 | 19 | # check Arm's website for the latest link 20 | cd /shared/tools 21 | wget -O arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 'https://developer.arm.com/-/media/Files/downloads/hpc/arm-compiler-for-linux/23-04-1/arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar' 22 | tar xf arm-compiler-for-linux_23.04.1_Ubuntu-20.04_aarch64.tar 23 | ./arm-compiler-for-linux_23.04.1_Ubuntu-20.04/arm-compiler-for-linux_23.04.1_Ubuntu-20.04.sh \ 24 | -i /shared/arm -a --force 25 | ``` 26 | 27 | You can check that the Arm Compiler for Linux and ArmPL are installed and loaded properly with the following commands: 28 | ``` 29 | source /etc/profile.d/modules.sh 30 | module use /shared/arm/modulefiles 31 | module av 32 | module load acfl armpl 33 | module list 34 | ``` 35 | You should see the following message if the installation was successful. 36 | ``` 37 | Currently Loaded Modulefiles: 38 | 1) binutils/12.2.0 2) acfl/23.04.1 3) armpl/23.04.1 39 | ``` 40 | 41 | After that, you need to install the EFA driver, which is not installed on an EC2 instance by default, and [Open MPI](README.md#open-mpi). 
42 | ``` 43 | # install EFA, Open MPI under /shared 44 | cd /shared/tools 45 | curl -O https://efa-installer.amazonaws.com/aws-efa-installer-1.25.0.tar.gz 46 | tar xf aws-efa-installer-1.25.0.tar.gz 47 | cd aws-efa-installer 48 | sudo ./efa_installer.sh -y 49 | ``` 50 | 51 | 52 | -------------------------------------------------------------------------------- /LICENSE-SAMPLECODE: -------------------------------------------------------------------------------- 1 | Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /LICENSE-SUMMARY: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 4 | 5 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 
6 | -------------------------------------------------------------------------------- /Monitoring_Tools_on_Graviton.md: -------------------------------------------------------------------------------- 1 | # Monitoring tools for AWS Graviton 2 | Listed below are some monitoring and profiling tools supported on AWS Graviton, along with some differences compared to the tools available on x86 processor architectures. 3 | 4 | Some of the most commonly used tools such as _top, htop, iostat, lstopo, hwloc, dmidecode, lmbench, Linux perf_ are supported on AWS Graviton processors. There are some tools such as Intel MLC, Intel VTune Profiler, and PCM that are supported only on Intel processors, and some tools such as _turbostat_ that are supported only on x86 processors. 5 | 6 | ## Details 7 | ### Info utilities 8 | #### *lscpu* utility 9 | The *lscpu* utility shows details of the processor features such as architecture, number of cores, active CPUs, caches, NUMA, instruction support and optionally CPU frequency. 10 | 11 | *$ lscpu* 12 | |Attribute| Value| 13 | |--- |--- | 14 | |Architecture| aarch64| 15 | |CPU(s)| 64| 16 | |On-line CPU(s) list| 0-63| 17 | |L1d| 4 MiB (64 instances)| 18 | |...|...| 19 | |NUMA node(s)| 1| 20 | |...|...| 21 | 22 | #### *dmidecode* to get CPU frequency info 23 | The *dmidecode* utility is a tool for listing details of the system's hardware components. 24 | 25 | *$ sudo dmidecode | less* 26 | |Attribute| Value| 27 | |--- |--- | 28 | |Socket Designation| CPU00| 29 | |Type| Central Processor| 30 | |Family| ARMv8| 31 | |Manufacturer| AWS| 32 | |...|...| 33 | |Max Speed| 2600 MHz| 34 | |Current Speed| 2600 MHz| 35 | |...|...| 36 | 37 | #### AWS API to get CPU frequency information 38 | The AWS EC2 API also allows you to query an instance type's maximum processor frequency.
39 | 40 | Below is an example using the AWS CLI to query the processor frequency of a Graviton3-based c7g.4xlarge: 41 | ``` 42 | $ aws ec2 describe-instance-types --instance-types c7g.4xlarge --query "InstanceTypes[].{InstanceType:InstanceType,SustainedClockSpeedInGhz:ProcessorInfo.SustainedClockSpeedInGhz}" --output json 43 | [ 44 | { 45 | "InstanceType": "c7g.4xlarge", 46 | "SustainedClockSpeedInGhz": 2.6 47 | } 48 | ] 49 | ``` 50 | 51 | #### *hwloc* and *lstopo* utilities 52 | Shown below is sample output for these utilities on a c6g.2xlarge instance. 53 | 54 | *$ hwloc-info* 55 | 56 | depth 0: 1 Machine (type #0) 57 | depth 1: 1 Package (type #1) 58 | depth 2: 1 L3Cache (type #6) 59 | depth 3: 8 L2Cache (type #5) 60 | depth 4: 8 L1dCache (type #4) 61 | depth 5: 8 L1iCache (type #9) 62 | ... 63 | 64 | 65 | *$ lstopo* 66 | 67 | Machine (15GB total) 68 | Package L#0 69 | NUMANode L#0 (P#0 15GB) 70 | L3 L#0 (32MB) 71 | L2 L#0 (1024KB) + L1d L#0 (64KB) + L1i L#0 (64KB) + Core L#0 + PU L#0 (P#0) 72 | L2 L#1 (1024KB) + L1d L#1 (64KB) + L1i L#1 (64KB) + Core L#1 + PU L#1 (P#1) 73 | L2 L#2 (1024KB) + L1d L#2 (64KB) + L1i L#2 (64KB) + Core L#2 + PU L#2 (P#2) 74 | ... 75 | 76 | ### Perf monitoring utilities 77 | On AWS Graviton processors, the **Linux perf** tool comes in handy for collecting an application execution profile and hardware perf counters. Much of the functionality provided by tools such as Intel *VTune Profiler* and *PCM* is supported in *Linux perf*. Below are some details of its usage and syntax. 78 | 79 | #### Collect basic CPU statistics for the specified command or system wide 80 | *$ perf stat command* 81 | 82 | Shown below are *Linux perf* stats collected system-wide on a c6g.2xlarge instance.
83 | 84 | *$ perf stat* 85 | 86 | Performance counter stats for 'system wide': 87 | 88 | 87692.26 msec cpu-clock # 8.000 CPUs utilized 89 | 441 context-switches # 5.029 /sec 90 | 13 cpu-migrations # 0.148 /sec 91 | 2 page-faults # 0.023 /sec 92 | 25115021 cycles # 0.000 GHz 93 | 28853592 instructions # 1.15 insn per cycle 94 | 68126 branch-misses 95 | 96 | 10.961122204 seconds time elapsed 97 | 98 | #### Collect basic or specific CPU hardware counters, for a specific command or system wide, for 10 seconds 99 | One can collect hardware events/counters for an application, on a specific CPU, for a PID, or system wide as follows: 100 | 101 | *$ perf stat -e cycles,instructions,cache-references,cache-misses,bus-cycles -a sleep 10* 102 | 103 | Performance counter stats for 'system wide': 104 | 105 | 161469308 cycles (80.01%) 106 | 120685678 instructions # 0.75 insn per cycle (79.97%) 107 | 42132948 cache-references (80.01%) 108 | 2001520 cache-misses # 4.750 % of all cache refs (80.02%) 109 | 160016796 bus-cycles (60.00%) 110 | 111 | 10.002896494 seconds time elapsed 112 | 113 | #### View the profile using perf report command 114 | *$ perf report* 115 | |Overhead | Command | Shared Object | Symbol| 116 | |--- |--- |--- |--- | 117 | |72.44% | functionA | functionA | classA::functionA| 118 | |7.66% | functionB | libB.so | classB::functionB| 119 | |... | | | | 120 | |0.81% | functionA | libc-2.31.so | memcmp| 121 | 122 | More details on how to use the Linux perf utility on AWS Graviton processors are available [here](https://github.com/aws/aws-graviton-getting-started/blob/main/optimizing.md#profiling-the-code). 123 | 124 | ## Summary: Utilities on AWS Graviton vs.
Intel x86 architectures 125 | |Processor |x86 |Graviton2, 3, and 4 | 126 | |--- |--- |--- | 127 | |CPU frequency listing |*lscpu, /proc/cpuinfo, dmidecode* |*dmidecode* | 128 | |*turbostat* support |Yes |No | 129 | |*hwloc* support |Yes |Yes | 130 | |*lstopo* support |Yes |Yes | 131 | |*i7z* works |Yes |No | 132 | |*lmbench* |Yes |Yes | 133 | |Intel *MLC* |Yes |No | 134 | |Performance monitoring tools |_[VTune Profiler](https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html), [PCM](https://github.com/opcm/pcm), [Linux perf](https://www.brendangregg.com/perf.html), [APerf](https://github.com/aws/aperf)_ |_[Linux perf](https://www.brendangregg.com/perf.html), [Linaro Forge](https://www.linaroforge.com/), [Arm Streamline CLI Tools](https://developer.arm.com/Tools%20and%20Software/Streamline%20Performance%20Analyzer), [APerf](https://github.com/aws/aperf)_ | 135 | 136 | Utilities such as *lmbench* are available [here](http://lmbench.sourceforge.net/) and can be built for AWS Graviton processors to obtain latency and bandwidth stats. 137 | 138 | **Notes**: 139 | 140 | **1.** The ARM Linux kernel community has decided not to expose CPU frequency in _/proc/cpuinfo_, which tools such as _lscpu_ read, directly or indirectly. 141 | 142 | **2.** On AWS Graviton processors, Turbo isn't supported, so utilities such as *turbostat* aren't supported or relevant. Also, tools such as *[i7z](https://code.google.com/archive/p/i7z/)* for discovering CPU frequency, turbo, sockets and other information are only supported on Intel processors. Intel *MLC* is a memory latency checker utility that is only supported on Intel processors. 143 | -------------------------------------------------------------------------------- /R.md: -------------------------------------------------------------------------------- 1 | # R on Graviton 2 | **Introduction** 3 | 4 | R is a free software environment for statistical computing and graphics.
It compiles and runs on a wide variety of platforms (including arm64). _[Read more on the R Project page](https://www.r-project.org)_. 5 | This page is meant to discuss differences between running R on Graviton versus other platforms, not to give instructions for R in general. 6 | 7 | ## 1. Installing R 8 | Because of its use cases, performance is a consideration for R. For that reason, using Amazon Linux is recommended because it is regularly updated to include Graviton-related optimizations. 9 | 10 | All instructions here are tested using the Amazon Linux distribution (specifically the 2023 version). Other than the package manager (yum/apt), they should work on other distributions as well. 11 | 12 | As on most platforms, the easiest way to install R is using the built-in package manager. 13 | 14 | ```sudo yum install R``` 15 | 16 | This will install R. However, as is also the case on most platforms, the package manager doesn't always have the latest version. If you need a more current version, you need to install manually from source. 17 | 18 | ## 2. Installing R packages 19 | CRAN (the default R package repository) hosts most packages as source code. This means that installing a package using the built-in package manager (install.packages) will automatically download the source and compile it for your platform. This works well on the Graviton platform because it creates the binaries you need and also lets you take advantage of processor optimizations that may be compiled in. 20 | 21 | Packages may not install because of missing libraries. In most cases, install.packages will show you the missing packages that provide those libraries. If too many things scroll by on the screen, run 22 | 23 | ```>warnings()``` 24 | 25 | from the R command prompt to review. 26 | 27 | There are some packages that need to be installed a little differently on Graviton because their installation includes a binary distribution.
28 | 29 | **For example:** 30 | 31 | ```>install.packages(c("devtools"), dependencies=TRUE) ``` 32 | 33 | will tell you that you need to first install libcurl and openssl. For Amazon Linux, use the package names listed on the ```\* rpm:``` line. 34 | 35 | In this case: 36 | 37 | ``` 38 | sudo yum install openssl-devel 39 | sudo yum install libcurl-devel 40 | ``` 41 | However, one of the required packages, gert, will tell you it needs ```libgit2-devel```. You don't see this in the installation on x86 because the gert install package includes a script that downloads a statically linked binary if it doesn't find the needed library. 42 | 43 | ```libgit2-devel``` is not currently available through yum, so you need to install it manually. 44 | 45 | In order to do that, you may need two additional packages, ```cmake``` and ```git```. **You also need to use the install prefix of /usr instead of /usr/local.** 46 | 47 | From the Linux command line: 48 | ``` 49 | sudo yum install cmake 50 | sudo yum install git 51 | git clone https://github.com/libgit2/libgit2 52 | cd libgit2 53 | mkdir build && cd build 54 | sudo cmake .. -DCMAKE_INSTALL_PREFIX=/usr 55 | sudo cmake --build . --target install 56 | cd .. 57 | rm -rf libgit2 58 | ``` 59 | 60 | After that, you can return to R and run 61 | 62 | ```>install.packages(c("devtools"), dependencies=TRUE) ``` 63 | 64 | and it should complete. 65 | 66 | ## 3. Compiled code use 67 | 68 | Any R package or program that uses compiled code will probably need to have that code recompiled. Refer to _[Using compiled code](https://cran.r-project.org/web/packages/box/vignettes/compiled-code.html)_ on the R Project site to see examples of what compiled code use may look like.
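One quick way to confirm that a package's compiled code really was rebuilt for this platform, rather than pulled in as a prebuilt x86_64 binary, is to inspect its shared objects with `file`, which reports the architecture of an ELF object. A minimal sketch, demonstrated on `/bin/sh` because R library paths vary by system:

```shell
# file -bL prints the type of the (dereferenced) target. On Graviton a
# correctly built shared object or executable reports "ARM aarch64",
# while a prebuilt x86_64 download would report "x86-64".
arch_info=$(file -bL /bin/sh)
echo "$arch_info"
```

For an R package, run the same check against the `.so` files under the package's `libs` directory (the library root is printed by `.libPaths()` inside R).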
69 | -------------------------------------------------------------------------------- /amis_cf_sm.md: -------------------------------------------------------------------------------- 1 | # Amazon Machine Images (AMIs) 2 | 3 | This document covers how to find AMIs compatible with AWS Graviton, and how to look up and use the AMIs in AWS Systems Manager and AWS CloudFormation. 4 | 5 | ## Using the console 6 | 7 | AMIs can be found in the console, as explained in the [AWS documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/finding-an-ami.html#finding-an-ami-console), 8 | when launching instances interactively. 9 | 10 | ## Using APIs - AWS Systems Manager Parameter Store 11 | 12 | When integrating the AMI lookup process in a script or in a piece of code, it is more convenient to leverage [AWS Systems Manager](https://aws.amazon.com/systems-manager/) Parameter Store. 13 | 14 | There's a good article about it on the AWS Compute blog: [Query for the latest Amazon Linux AMI IDs using AWS Systems Manager Parameter Store](https://aws.amazon.com/blogs/compute/query-for-the-latest-amazon-linux-ami-ids-using-aws-systems-manager-parameter-store/).
15 | 16 | |OS release|Parameter name| 17 | |----------|--------------| 18 | |Amazon Linux 2023|`/aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-arm64`| 19 | |Amazon Linux 2023 minimal|`/aws/service/ami-amazon-linux-latest/al2023-ami-minimal-kernel-default-arm64`| 20 | |Amazon Linux 2|`/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-arm64-gp2`| 21 | |Ubuntu 24.04|`/aws/service/canonical/ubuntu/server/24.04/stable/current/arm64/hvm/ebs-gp3/ami-id`| 22 | |Ubuntu 22.04|`/aws/service/canonical/ubuntu/server/22.04/stable/current/arm64/hvm/ebs-gp2/ami-id`| 23 | |Ubuntu 20.04|`/aws/service/canonical/ubuntu/server/20.04/stable/current/arm64/hvm/ebs-gp2/ami-id`| 24 | |Debian 12|`/aws/service/debian/release/12/latest/arm64`| 25 | |Debian 11|`/aws/service/debian/release/11/latest/arm64`| 26 | |SLES 15 SP6|`/aws/service/suse/sles/15-sp6/arm64/latest`| 27 | 28 | Here is an example to get the AMI ID of the latest **Amazon Linux 2023** version in the us-east-1 region: 29 | 30 | ```sh 31 | $ aws ssm get-parameter --name /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-arm64 --region us-east-1 --query Parameter.Value --output text 32 | ``` 33 | 34 | Here is an example to get the AMI ID of the latest **Amazon Linux 2** version in the us-east-1 region: 35 | 36 | ```sh 37 | $ aws ssm get-parameter --name /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-arm64-gp2 --region us-east-1 --query Parameter.Value --output text 38 | ``` 39 | 40 | AWS CloudFormation also supports AWS Systems Manager Parameter Store for obtaining AMI IDs without 41 | hard-coding them. 
42 | 43 | Here is an example demonstrating how to refer to the latest **Amazon Linux 2023 AMI** for Graviton/arm64 in a CloudFormation template: 44 | 45 | ```yaml 46 | Parameters: 47 | LatestAmiId: 48 | Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id> 49 | Default: /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-arm64 50 | 51 | Resources: 52 | GravitonInstance: 53 | Type: AWS::EC2::Instance 54 | Properties: 55 | ImageId: !Ref LatestAmiId 56 | InstanceType: c7g.medium 57 | ``` 58 | 59 | 60 | Here is an example demonstrating how to refer to the latest **Amazon Linux 2** AMI for Graviton/arm64 in a CloudFormation template: 61 | 62 | ```yaml 63 | Parameters: 64 | LatestAmiId: 65 | Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id> 66 | Default: /aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-arm64-gp2 67 | 68 | Resources: 69 | GravitonInstance: 70 | Type: AWS::EC2::Instance 71 | Properties: 72 | ImageId: !Ref LatestAmiId 73 | InstanceType: c7g.medium 74 | ``` 75 |
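The parameters from the table above can also be resolved in bulk from a script. A sketch that only prints the lookup commands, so it can be inspected without credentials (drop the `echo` to actually run them; the two parameter names are taken from the table, and valid AWS credentials are assumed when executing for real):

```shell
# Build one `aws ssm get-parameter` command per AMI parameter of interest.
for param in \
  /aws/service/ami-amazon-linux-latest/al2023-ami-kernel-default-arm64 \
  /aws/service/debian/release/12/latest/arm64
do
  echo aws ssm get-parameter --name "$param" --region us-east-1 \
    --query Parameter.Value --output text
done
```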
"headers": { 14 | "Content-Type": "application/json", 15 | }, 16 | "statusCode": 200, 17 | "body": responsebody 18 | }; 19 | return response; 20 | }; -------------------------------------------------------------------------------- /aws-lambda/GravitonLambdaNumber/src/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webtest", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "axios": "^1.6.5" 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /aws-lambda/GravitonLambdaNumber/template.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | 3 | Transform: 4 | - AWS::Serverless-2016-10-31 5 | 6 | Resources: 7 | HttpApi: 8 | Type: AWS::Serverless::HttpApi 9 | LambdaNumberFunction: 10 | Type: AWS::Serverless::Function 11 | Properties: 12 | CodeUri: src/ 13 | Handler: app.lambdaHandler 14 | Runtime: nodejs20.x 15 | Architectures: 16 | - arm64 17 | MemorySize: 128 18 | Timeout: 100 19 | Description: A Lambda function that returns a static string. 
20 | Events: 21 | Proxy: 22 | Type: HttpApi 23 | Properties: 24 | Path: / 25 | Method: post 26 | ApiId: !Ref HttpApi 27 | 28 | Outputs: 29 | ApiBasePath: 30 | Description: "API Gateway endpoint URL" 31 | Value: !Sub "https://${HttpApi}.execute-api.${AWS::Region}.amazonaws.com" -------------------------------------------------------------------------------- /aws-lambda/GravitonLambdaNumber/test/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "version":"2.0", 3 | "routeKey":"POST /", 4 | "rawPath":"/", 5 | "rawQueryString":"", 6 | "body":"{ \"number\": \"234\", \"type\": \"date\" }", 7 | "isBase64Encoded":false 8 | } -------------------------------------------------------------------------------- /aws-lambda/PythonPrime/samconfig.toml: -------------------------------------------------------------------------------- 1 | version = 0.1 2 | [default] 3 | [default.deploy] 4 | [default.deploy.parameters] 5 | stack_name = "PythonPrime" 6 | s3_bucket = "aws-sam-cli-managed-default-samclisourcebucket-2gz7xh5d8tdn" 7 | s3_prefix = "PythonPrime" 8 | region = "us-east-1" 9 | capabilities = "CAPABILITY_IAM" 10 | image_repositories = [] 11 | -------------------------------------------------------------------------------- /aws-lambda/PythonPrime/src/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import platform 4 | import timeit 5 | def primes_up_to(n): 6 | primes = [] 7 | for i in range(2, n+1): 8 | is_prime = True 9 | sqrt_i = math.isqrt(i) 10 | for p in primes: 11 | if p > sqrt_i: 12 | break 13 | if i % p == 0: 14 | is_prime = False 15 | break 16 | if is_prime: 17 | primes.append(i) 18 | return primes 19 | 20 | def lambda_handler(event, context): 21 | print(event) 22 | start_time = timeit.default_timer() 23 | N = int(event['queryStringParameters']['max']) 24 | primes = primes_up_to(N) 25 | stop_time = timeit.default_timer() 26 | elapsed_time = 
stop_time - start_time 27 | 28 | response = { 29 | 'machine': platform.machine(), 30 | 'elapsed': elapsed_time, 31 | 'message': 'There are {} prime numbers <= {}'.format(len(primes), N) 32 | } 33 | 34 | return { 35 | 'statusCode': 200, 36 | 'body': json.dumps(response) 37 | } -------------------------------------------------------------------------------- /aws-lambda/PythonPrime/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Transform: AWS::Serverless-2016-10-31 3 | Description: Graviton2 demo to show how to compare x86 and arm64 Lamda functions using AWS Lambda Power Tuning 4 | Globals: 5 | Function: 6 | Timeout: 30 7 | Resources: 8 | PythonPrimeArmFunction: 9 | Type: AWS::Serverless::Function 10 | Properties: 11 | CodeUri: src/ 12 | Handler: app.lambda_handler 13 | Runtime: python3.9 14 | Architectures: 15 | - arm64 16 | PythonPrimeX86Function: 17 | Type: AWS::Serverless::Function 18 | Properties: 19 | CodeUri: src/ 20 | Handler: app.lambda_handler 21 | Runtime: python3.9 22 | Architectures: 23 | - x86_64 24 | 25 | Outputs: 26 | PythonPrimeX86Function: 27 | Description: "Python Prime Lambda Function ARN" 28 | Value: !GetAtt PythonPrimeX86Function.Arn 29 | PythonPrimeArmFunction: 30 | Description: "Python Prime Lambda Function ARN" 31 | Value: !GetAtt PythonPrimeArmFunction.Arn -------------------------------------------------------------------------------- /aws-lambda/img/ApiBasePath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/ApiBasePath.png -------------------------------------------------------------------------------- /aws-lambda/img/LambdaGraviton.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/LambdaGraviton.png -------------------------------------------------------------------------------- /aws-lambda/img/createfunctionfromimage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/createfunctionfromimage.png -------------------------------------------------------------------------------- /aws-lambda/img/curlarm64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/curlarm64.png -------------------------------------------------------------------------------- /aws-lambda/img/curlx86.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/curlx86.png -------------------------------------------------------------------------------- /aws-lambda/img/dockerbuild.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/dockerbuild.png -------------------------------------------------------------------------------- /aws-lambda/img/dockerinspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/dockerinspect.png -------------------------------------------------------------------------------- /aws-lambda/img/dockerrun.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/dockerrun.png -------------------------------------------------------------------------------- /aws-lambda/img/dockerrunresponse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/dockerrunresponse.png -------------------------------------------------------------------------------- /aws-lambda/img/powertuningcompare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/powertuningcompare.png -------------------------------------------------------------------------------- /aws-lambda/img/powertuningcomparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/powertuningcomparison.png -------------------------------------------------------------------------------- /aws-lambda/img/powertuningstatemachine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/powertuningstatemachine.png -------------------------------------------------------------------------------- /aws-lambda/img/powertuningx86results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/powertuningx86results.png 
-------------------------------------------------------------------------------- /aws-lambda/img/primefunctions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/primefunctions.png -------------------------------------------------------------------------------- /aws-lambda/img/sambuild.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/sambuild.png -------------------------------------------------------------------------------- /aws-lambda/img/sambuildcontainer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/sambuildcontainer.png -------------------------------------------------------------------------------- /aws-lambda/img/samdeploy-g.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/samdeploy-g.png -------------------------------------------------------------------------------- /aws-lambda/img/samlocalinvoke.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/aws-lambda/img/samlocalinvoke.png -------------------------------------------------------------------------------- /dotnet.md: -------------------------------------------------------------------------------- 1 | # .NET on Graviton 2 | .NET is an open-source platform for writing different types of applications. 
Software engineers can write .NET based applications in multiple languages such as C#, F#, and Visual Basic. .NET applications are compiled into Common Intermediate Language (CIL). When an application is executed, the Common Language Runtime (CLR) loads that application binary and uses a just-in-time (JIT) compiler to generate machine code for the architecture being executed on. For more information, please see [what is .NET](https://dotnet.microsoft.com/learn/dotnet/what-is-dotnet). 3 | 4 | 5 | ## .NET Versions 6 | 7 | Version | Linux Arm32 | Linux Arm64 | Notes 8 | ------------------|-----------|-----------|------------- 9 | .NET 9 | Yes | Yes | v9.0.0 released November 12, 2024 with Arm64 Linux builds. See also [Arm64 vectorization in .NET libraries](https://learn.microsoft.com/en-us/dotnet/core/whats-new/dotnet-9/runtime#arm64-vectorization-in-net-libraries). 10 | .NET 8 | Yes | Yes | v8.0.0 released November 14, 2023 with Arm64 Linux builds. See also [Arm64 Performance Improvements in .NET 8](https://devblogs.microsoft.com/dotnet/this-arm64-performance-in-dotnet-8/). For details on .NET 8 and Graviton, check out this blog: [Powering .NET 8 with AWS Graviton3: Benchmarks](https://aws.amazon.com/blogs/dotnet/powering-net-8-with-aws-graviton3-benchmarks/) 11 | .NET 7 | Yes | Yes | v7.0.0 released November 8, 2022 with Arm64 Linux builds. For more details check out this video: [Boosting .NET application performance with Arm64 and AWS Graviton 3](https://www.youtube.com/watch?v=V4Lxs5TbaFk) Note that .NET 7 is [out of support](https://dotnet.microsoft.com/en-us/platform/support/policy/dotnet-core#lifecycle). 12 | [.NET 6](https://dotnet.microsoft.com/download/dotnet/6.0) | Yes | Yes | V6.0.0 released November 8, 2021 with Arm64 Linux builds. 
For more details check out this blog: [.NET 6 on AWS](https://aws.amazon.com/blogs/developer/net-6-on-aws/) and video: [AWS re:Invent 2021 - Accelerate .NET 6 performance with Arm64 on AWS Graviton2](https://www.youtube.com/watch?v=iMlyZI9NhFw) 13 | [.NET 5](https://dotnet.microsoft.com/download/dotnet/5.0) | Yes | Yes | Arm64-specific optimizations in the .NET libraries and the code produced by RyuJIT. [Arm64 Performance in .NET 5](https://devblogs.microsoft.com/dotnet/arm64-performance-in-net-5/). Note that .NET 5 is [out of support](https://dotnet.microsoft.com/en-us/platform/support/policy/dotnet-core#lifecycle). 14 | [.NET Framework 4.x](https://dotnet.microsoft.com/learn/dotnet/what-is-dotnet-framework) | No | No | The original implementation of the .NET Framework does not support Linux hosts, and Windows hosts are not supported on Graviton. 15 | [.NET Core 3.1](https://dotnet.microsoft.com/download/dotnet/3.1) | Yes | Yes | .NET Core 3.0 added support for [Arm64 for Linux](https://docs.microsoft.com/en-us/dotnet/core/whats-new/dotnet-core-3-0#linux-improvements). Note that .NET Core 3.1 is [out of support](https://dotnet.microsoft.com/en-us/platform/support/policy/dotnet-core#lifecycle). 16 | [.NET Core 2.1](https://dotnet.microsoft.com/download/dotnet/2.1) | Yes* | No | Initial support for [Arm32 was added in .NET Core 2.1](https://github.com/dotnet/announcements/issues/82). *Operating system support is limited, please see the [official documentation](https://github.com/dotnet/core/blob/main/release-notes/2.1/2.1-supported-os.md). Note that .NET Core 2.1 is [out of support](https://dotnet.microsoft.com/en-us/platform/support/policy/dotnet-core#lifecycle). 17 | 18 | 19 | ## .NET 5 20 | With .NET 5, Microsoft made specific Arm64 architecture optimizations. These optimizations were made in the .NET libraries as well as in the machine code output by the JIT process.
21 | 22 | * AWS DevOps Blog [Build and Deploy .NET web applications to ARM-powered AWS Graviton 2 Amazon ECS Clusters using AWS CDK](https://aws.amazon.com/blogs/devops/build-and-deploy-net-web-applications-to-arm-powered-aws-graviton-2-amazon-ecs-clusters-using-aws-cdk/) 23 | * AWS Compute Blog [Powering .NET 5 with AWS Graviton2: Benchmarks](https://aws.amazon.com/blogs/compute/powering-net-5-with-aws-graviton2-benchmark-results/) 24 | * Microsoft .NET Blog [ARM64 Performance in .NET 5](https://devblogs.microsoft.com/dotnet/arm64-performance-in-net-5/) 25 | 26 | 27 | ## Building & Publishing for Linux Arm64 28 | The .NET SDK supports choosing a [Runtime Identifier (RID)](https://docs.microsoft.com/en-us/dotnet/core/rid-catalog) used to target the platforms where the application runs. These RIDs are used by .NET dependencies (NuGet packages) to represent platform-specific assets. The following values are examples of RIDs: linux-arm64, linux-x64, ubuntu.14.04-x64, win7-x64, or osx.10.12-x64. For NuGet packages with native dependencies, the RID designates on which platforms the package can be restored. 29 | 30 | You can build and publish on any host operating system. As an example, you can develop on Windows and build locally to target Arm64, or you can use a CI server like Jenkins on Linux. The commands are the same. 31 | 32 | ```bash 33 | dotnet build -r linux-arm64 34 | dotnet publish -c Release -r linux-arm64 35 | ``` 36 | 37 | For more information about [publishing .NET apps with the .NET CLI](https://docs.microsoft.com/en-us/dotnet/core/deploying/deploy-with-cli), please see the official documentation. 38 | -------------------------------------------------------------------------------- /dpdk_spdk.md: -------------------------------------------------------------------------------- 1 | # DPDK, SPDK, and ISA-L support Graviton 2 | 3 | Graviton2 and later CPUs are optimized for data path functions like networking and storage.
Users of [DPDK](https://github.com/dpdk/dpdk) and [SPDK](https://github.com/spdk/spdk) can download and compile natively on Graviton following the normal installation guidelines from the respective repositories linked above. 4 | 5 | **NOTE**: *Although DPDK precompiled packages are available from Ubuntu, we recommend building them from source.* 6 | 7 | SPDK often relies on [ISA-L](https://github.com/intel/isa-l), which is already optimized for Arm64 and the CPU cores in Graviton2 and later processors. 8 | 9 | 10 | 11 | ## Compile DPDK from source 12 | 13 | The [DPDK official guidelines](https://doc.dpdk.org/guides/linux_gsg/build_dpdk.html) require using *meson* and *ninja* to build from source code. 14 | 15 | A native compilation of DPDK on Graviton will generate optimized code that takes advantage of the CRC and crypto instructions in Graviton2 and later CPU cores. 16 | 17 | **NOTE**: Some of the installation steps call "python", which may not be a valid command in modern Linux distributions; you may need to install *python-is-python3* to resolve this. 18 | 19 | ### Older DPDK version with makefile-based build 20 | 21 | If a developer is using the makefile-based build (vs the newer *meson*), the following [patch](https://www.mail-archive.com/dev@dpdk.org/msg179445.html) will enable a Graviton2-optimized build. 22 | 23 | 24 | ## Performance consideration 25 | 26 | ### Optimal RTE settings 27 | 28 | In some older releases (prior to DPDK 20.11), some default parameters are not optimal and developers should check the values: 29 | * RTE_MAX_LCORE should be at least 64 30 | * RTE_CACHE_LINE_SIZE=64 31 | 32 | ### Number of LCores used could be misconfigured 33 | 34 | Some applications, written with the x86 architecture in mind, set the number of active DPDK threads (lcores) to half the number of vCPUs, in order to run a single thread per physical core on x86 and effectively disable SMT.
However, on Graviton, every vCPU is a full physical core, and a developer can use more threads or lcores than on a same-size x86-based instance. For example, a c5.16xl has 64 vCPUs but only 32 physical cores, so some DPDK applications would run only 32 lcores to guarantee one thread per physical core. On a c6g.16xl, a developer can use all 64 physical cores. 35 | 36 | ## Known issues 37 | 38 | * **testpmd:** The flowgen function of testpmd does not work correctly when compiled with GCC 9 and above. It generates IP packets with a wrong checksum, which are dropped when transmitted between AWS instances (including Graviton). This is a known issue and there is a [patch](https://patches.dpdk.org/patch/84772/) that fixes it. 39 | 40 | -------------------------------------------------------------------------------- /golang.md: -------------------------------------------------------------------------------- 1 | # Go on Graviton 2 | 3 | Go is a statically typed, compiled programming language originally designed at Google. Go supports arm64 out of the box and is available in all common distributions, with recent changes that improve performance, so make sure to use the latest version of the Go compiler and toolchain. 4 | 5 | ## Noteworthy performance upgrades 6 | ### Go 1.18 \[released 2022/03/14\] 7 | The main implementation of the Go compiler, [golang/go](https://github.com/golang/go), has improved 8 | performance on Arm by implementing a new way of passing function arguments and results using registers instead of the stack. This change has been available on x86-64 since 1.17, where it brought performance improvements of about 5%. On Arm this change typically gives even higher performance improvements of 10% or more. 9 | 10 | To learn more about the use cases benefiting from Go 1.18's performance improvements, check the blog post: [Making your Go workloads up to 20% faster with Go 1.18 and AWS Graviton](https://aws.amazon.com/blogs/compute/making-your-go-workloads-up-to-20-faster-with-go-1-18-and-aws-graviton/).
11 | 12 | ### Go 1.17 \[released 2021/08/16\] 13 | The main implementation of the Go compiler, [golang/go](https://github.com/golang/go), has improved 14 | performance for the following standard library packages: 15 | 16 | - crypto/ed25519 - the package has been rewritten, and all operations are now approximately twice as fast on both arm64 and amd64. 17 | - crypto/elliptic - CurveParams methods now automatically invoke faster and safer dedicated implementations for known curves (P-224, P-256, and P-521) when available. The P521 curve implementation has also been rewritten and is now constant-time and three times faster on amd64 and arm64. 18 | 19 | 20 | ### Go 1.16 \[released 2021/02/16\] 21 | The main implementation of the Go compiler, [golang/go](https://github.com/golang/go), has improved 22 | performance on Arm with a couple of changes listed below. Building your project with Go 1.16 will give you these improvements: 23 | 24 | * [ARMv8.1-A Atomics instructions](https://go-review.googlesource.com/c/go/+/234217), which dramatically improve mutex fairness and speed on Graviton2 and other modern Arm cores with the v8.1 or newer instruction set. 25 | * [copy performance improvements](https://go-review.googlesource.com/c/go/+/243357), especially when the addresses are unaligned. 26 | 27 | ### Recently updated packages 28 | Changes to commonly used packages that improve performance on Arm can make a noticeable difference in 29 | some cases. Here is a partial list of packages to be aware of. 30 | 31 | Package | Version | Improvements 32 | ----------|-----------|------------- 33 | [Snappy](https://github.com/golang/snappy) | as of commit [196ae77](https://github.com/golang/snappy/commit/196ae77b8a26000fa30caa8b2b541e09674dbc43) | assembly implementations of the hot path functions were ported from amd64 to arm64 34 | 35 | ## Using Go in a Container with CPU Limits 36 | 37 | Go automatically assigns a sensible value to `GOMAXPROCS` based on the number of 38 | CPU cores available.
However, using a container with a limitation on how much 39 | CPU is available to that container can lead to problems. For example, using the 40 | [CFS scheduler](https://docs.docker.com/engine/containers/resource_constraints/#configure-the-default-cfs-scheduler) 41 | option in Docker, `--cpus=1` can limit the available CPU time to the 42 | equivalent of 1 CPU while still exposing all of the actually available CPUs to 43 | the container. If you use CPU limits in this way, it may make sense to also 44 | manually set `GOMAXPROCS` to an equivalent value. 45 | -------------------------------------------------------------------------------- /howtoresources.md: -------------------------------------------------------------------------------- 1 | # How to Resources 2 | 3 | If you are just getting started with AWS Graviton-based instances, or even if you have been using AWS Graviton for some time, below is a list of some of the tech postings from around the AWS community. The list includes blog postings, tech talks and some recent re:Invent sessions as well. It also covers topics including running applications on AWS, AI/ML, cost optimization, sustainability, and HPC. 
4 | 5 | ### Recent Events 6 | 7 | #### [AWS re:Invent 2023](https://reinvent.awsevents.com/) 8 | * [CMP404: Migrating to AWS Graviton with AWS container services](https://www.youtube.com/watch?v=9JZVomrx6uQ) 9 | * [CMP315: Migrating critical business applications to AWS Graviton with ease](https://www.youtube.com/watch?v=9W0j__k5afg) 10 | * [CMP334: The best price performance for your AWS workloads (Graviton4 deep dive)](https://www.youtube.com/watch?v=T_hMIjKtSr4) 11 | 12 | ### Development Tools 13 | 14 | * Performance Analysis - [APerf](https://github.com/aws/aperf) 15 | * Source Code Analysis - [Porting Advisor for Graviton](https://github.com/aws/porting-advisor-for-graviton) 16 | 17 | ### Building / Running applications on AWS Graviton 18 | 19 | * Tech Talk - [AWS Graviton and EC2 - a bit of history](https://www.youtube.com/watch?v=yAf6-A8Zso4) 20 | * Blog - [Multi-Architecture Container Builds with CodeCatalyst](https://aws.amazon.com/blogs/devops/multi-architecture-container-builds-with-codecatalyst/) 21 | * Tech Talk - [Package management with Graviton](https://www.youtube.com/watch?v=ysmvoO4DgB8) 22 | * Tech Talk - [AMIs for Graviton - Operating systems supporting Graviton, how to make the best choice](https://youtu.be/mzDlszhJetI) 23 | * Twitch - [How to migrate your first application to Graviton](https://www.linkedin.com/video/live/urn:li:ugcPost:7036455289579077633/) 24 | * Blog - [Unlock the power of EC2 Graviton with GitLab CI/CD and EKS Runners](https://aws.amazon.com/cn/blogs/devops/unlock-the-power-of-ec2-graviton-with-gitlab-ci-cd-and-eks-runners/) 25 | * Twitch - [Measuring Performance - How to measure performance on Graviton based instances](https://www.twitch.tv/videos/1876233449?collection=ZbDCkIlpDhemjg) 26 | 27 | ### Cost Optimization 28 | 29 | * Virtual Workshop - [Optimizing Cost with AWS Graviton Based Services](https://www.youtube.com/watch?v=BfiEEx8k1lQ) 30 | * Blog - [Optimize AWS costs without architectural changes or engineering 
overhead](https://aws.amazon.com/blogs/aws-cloud-financial-management/optimize-aws-costs-without-architectural-changes-or-engineering-overhead/) 31 | * Tech Talk - [Optimize your workloads, no architectural changes needed](https://pages.awscloud.com/Optimize-your-workloads-no-architectural-changes-needed_2023_0301-OTT-OD-NGI_OD) 32 | 33 | ### Sustainability 34 | 35 | * Tech Talk - [Building Sustainable Infrastructure with EC2 Graviton](https://www.youtube.com/watch?v=TmHIROOQ1Mc) 36 | * Tech Talk - [How to optimize workloads for sustainability using AWS Graviton-based EC2 instances](https://www.youtube.com/watch?v=pzSvcsduijM) 37 | 38 | ### Programming Languages 39 | 40 | * Rust - Blog - [Building Rust Applications For AWS Graviton](https://community.aws/tutorials/building-rust-applications-for-aws-graviton) 41 | * .Net - Blog - [.NET Workflows for arm64 with Amazon CodeCatalyst: Part 1](https://aws.amazon.com/blogs/dotnet/net-workflows-for-arm64-with-codecatalyst-part-1/) 42 | * .Net - Blog - [.NET Workflows for arm64 with Amazon CodeCatalyst: Part 2](https://aws.amazon.com/blogs/dotnet/net-workflows-for-arm64-with-codecatalyst-part-2/) 43 | * Java - Tech Talk - [Java on Graviton - How to use Corretto](https://www.youtube.com/watch?v=zANOBN4jZfI) 44 | * Python - Twitch - [How to develop and build a python module with native extension for a multi-architecture deployment](https://www.twitch.tv/videos/1888177585?collection=ZbDCkIlpDhemjg) 45 | 46 | ### HPC 47 | 48 | * Blog - [Application deep-dive into the AWS Graviton3E-based Amazon EC2 Hpc7g instance](https://aws.amazon.com/blogs/hpc/application-deep-dive-into-the-graviton3e-based-amazon-ec2-hpc7g-instance/) 49 | 50 | ### AI/ML 51 | 52 | * Blog - [Optimized PyTorch 2.0 inference with AWS Graviton processors](https://aws.amazon.com/blogs/machine-learning/optimized-pytorch-2-0-inference-with-aws-graviton-processors/) 53 | * Blog - [Reduce Amazon SageMaker inference cost with AWS 
Graviton](https://aws.amazon.com/blogs/machine-learning/reduce-amazon-sagemaker-inference-cost-with-aws-graviton/) 54 | * Tech Talk - [Deep Dive: PyTorch 2.0 on AWS Graviton](https://www.youtube.com/watch?v=c1Rl-vCmnT0) 55 | * Blog - [Implementing a Full ML Lifecycle with Anaconda Distribution on AWS Graviton](https://www.anaconda.com/blog/implementing-a-full-ml-lifecycle-with-anaconda-distribution-on-aws-graviton) 56 | 57 | ### Porting Advisor for AWS Graviton 58 | 59 | * Twitch - [Porting Advisor for Graviton](https://www.twitch.tv/videos/1822190104) 60 | * Blog - [Using Porting Advisor for Graviton](https://aws.amazon.com/blogs/compute/using-porting-advisor-for-graviton/) 61 | 62 | ### Databases 63 | 64 | * Blog - [Choose AWS Graviton and cloud storage for your Ethereum nodes infrastructure on AWS](https://aws.amazon.com/blogs/database/choose-aws-graviton-and-cloud-storage-for-your-ethereum-nodes-infrastructure-on-aws/) 65 | 66 | ### Customer Stories 67 | 68 | * Datadog - [What to know before adopting Arm: Lessons learned at Datadog](https://www.youtube.com/watch?v=bbchHOFVUuY) 69 | * Aerospike - [Building real-time applications to utilize AWS Graviton](https://www.youtube.com/watch?v=-9ul3j-fBpU) 70 | * Tehama - [Tehama leverages Graviton cost efficiency to strengthen business core competency](https://aws.amazon.com/blogs/industries/tehama-leverages-graviton-cost-efficiency-to-strengthen-business-core-competency/) 71 | * Zomato - [How Zomato Boosted Performance 25% and Cut Compute Cost 30% Migrating Trino and Druid Workloads to AWS Graviton](https://aws.amazon.com/blogs/opensource/how-zomato-boosted-performance-25-and-cut-compute-cost-30-migrating-trino-and-druid-workloads-to-aws-graviton/) 72 | * Stripe - [AWS Graviton deep dive: The best price performance for AWS workloads](https://youtu.be/lZkO-KelLnk?t=1858) 73 | * Snowflake - [How Snowflake Optimized its Virtual Warehouses for Sustainability Using AWS 
Graviton](https://aws.amazon.com/blogs/apn/how-snowflake-optimized-its-virtual-warehouses-for-sustainability-using-aws-graviton/) 74 | -------------------------------------------------------------------------------- /machinelearning/onnx.md: -------------------------------------------------------------------------------- 1 | # ML inference on Graviton CPUs with Open Neural Network Exchange (ONNX) 2 | 3 | **Introduction** 4 | 5 | ONNX is an open-source machine learning framework that provides interoperability between different frameworks. ONNX Runtime is the runtime engine used for model inference and training with ONNX. This document covers how to use ONNX-based machine learning inference on Graviton CPUs, which runtime configurations are important, and how to run benchmarking scripts. The document also covers instructions for source builds to enable experimenting with downstream features. 6 | 7 | # How to use ONNX on Graviton CPUs 8 | 9 | Python wheels are the recommended way to use ONNX on Graviton; the default backend is optimized for Graviton CPUs. 10 | 11 | ``` 12 | # Upgrade pip3 to the latest version 13 | python3 -m pip install --upgrade pip 14 | 15 | # Install ONNX and ONNX Runtime 16 | python3 -m pip install onnx 17 | python3 -m pip install onnxruntime 18 | ``` 19 | 20 | # Prerequisites 21 | 22 | 1. It is highly recommended to use AMIs based on Linux Kernel 5.10 and beyond for the best ONNX inference performance on Graviton3 instances. The following queries can be used to list the AMIs with Kernel 5.10 and beyond.
23 | 24 | ``` 25 | # For Kernel 5.10 based AMIs list 26 | aws ec2 describe-images --owners amazon --filters "Name=architecture,Values=arm64" "Name=name,Values=*kernel-5.10*" --query 'sort_by(Images, &CreationDate)[].Name' 27 | 28 | # For Kernel 6.x based AMIs list 29 | aws ec2 describe-images --owners amazon --filters "Name=architecture,Values=arm64" "Name=name,Values=*kernel-6.*" --query 'sort_by(Images, &CreationDate)[].Name' 30 | ``` 31 | 32 | # Runtime configurations for optimal performance 33 | 34 | Graviton3(E) (e.g. c7g/m7g/r7g, c7gn and Hpc7g instances) supports the BFloat16 format and advanced Matrix Multiplication (MMLA) instructions for ML acceleration. Starting with version v1.17.0, ONNX Runtime supports BFloat16-accelerated SGEMM kernels and INT8 MMLA-accelerated Quantized GEMM (QGEMM) kernels on Graviton3(E) CPUs. 35 | 36 | Note: The standard FP32 model inference can be accelerated with BFloat16 SGEMM kernels without model quantization. 37 | 38 | MMLA QGEMM kernels are enabled by default. To enable BF16 acceleration, set the onnxruntime session option as shown below: 39 | 40 | ```c++ 41 | // For C++ applications 42 | SessionOptions so; 43 | so.config_options.AddConfigEntry( 44 | kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16, "1"); 45 | ``` 46 | 47 | ```python 48 | # For Python applications 49 | sess_options = onnxruntime.SessionOptions() 50 | sess_options.add_session_config_entry("mlas.enable_gemm_fastmath_arm64_bfloat16", "1") 51 | ``` 52 | 53 | # Evaluate performance with ONNX Runtime benchmark 54 | The ONNX Runtime repo provides inference benchmarking scripts for transformers-based language models. The scripts support a wide range of models, frameworks and formats. The following section explains how to run BERT, RoBERTa and GPT model inference in fp32 and int8 quantized formats. Refer to the [ONNX Runtime Benchmarking script](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/benchmark.py) for more details.
55 | 56 | 57 | ``` 58 | # Install onnx and onnx runtime
59 | python3 -m pip install onnx onnxruntime 60 | 61 | # Install the dependencies 62 | python3 -m pip install transformers torch psutil 63 | 64 | # Clone onnxruntime repo to get the benchmarking scripts 65 | git clone --recursive https://github.com/microsoft/onnxruntime.git 66 | cd onnxruntime/onnxruntime/python/tools/transformers 67 | 68 | # The scripts download the models, export them to onnx format, 69 | # quantize into int8 for int8 inference, run inference for 70 | # different sequence lengths and batch sizes. Upon successful run, 71 | # the scripts print the inference throughput in QPS (Queries/sec) 72 | # and latency in msec along with system configuration 73 | 74 | # Next run the benchmarks, selecting fp32 or int8 precision via the -p argument 75 | # To run bert-large 76 | python3 benchmark.py -m bert-large-uncased -p <fp32/int8> 77 | 78 | # To run bert-base 79 | python3 benchmark.py -m bert-base-cased -p <fp32/int8> 80 | 81 | # To run roberta-base 82 | python3 benchmark.py -m roberta-base -p <fp32/int8> 83 | 84 | # To run gpt2 85 | python3 benchmark.py -m gpt2 -p <fp32/int8> 86 | 87 | ``` 88 | 89 | # Building ONNX Runtime from source 90 | 91 | We recommend using the official Python wheel distribution, but there are cases where developers may want to compile ONNX Runtime from source. One such case would be to experiment with new features or develop custom features. This section outlines the recommended way to compile ONNX Runtime from source. 92 | 93 | ``` 94 | # Clone onnxruntime 95 | git clone --recursive https://github.com/Microsoft/onnxruntime.git 96 | cd onnxruntime 97 | 98 | # Install cmake 3.27 or higher; one option is via the pip installer.
99 | python3 -m pip install cmake 100 | 101 | # Build python wheel with Release configuration 102 | ./build.sh --config=Release --build_shared_lib --build_wheel --parallel 16 103 | 104 | # the wheel is copied to the build/Linux/Release/dist folder, so, to install 105 | pip3 install build/Linux/Release/dist/*.whl 106 | ``` 107 | -------------------------------------------------------------------------------- /machinelearning/vllm.md: -------------------------------------------------------------------------------- 1 | # Large Language Model (LLM) inference on Graviton CPUs with vLLM 2 | 3 | **Introduction** 4 | 5 | vLLM is a fast and easy-to-use library for LLM inference and serving. It provides an OpenAI-compatible API server and supports NVIDIA GPUs, CPUs and AWS Neuron. vLLM has been adapted to work on Arm64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform. The Arm CPU backend currently supports Float32, FP16 and BFloat16 datatypes. 6 | This document covers how to build and run vLLM for LLM inference on AWS Graviton-based Amazon EC2 instances. 7 | 8 | # How to use vLLM on Graviton CPUs 9 | 10 | There are no pre-built wheels or images for Graviton CPUs, so you must build vLLM from source. 12 | 13 | **Prerequisites** 14 | Graviton3(E) (e.g. *7g instances) and Graviton4 (e.g. *8g instances) CPUs support the BFloat16 format and MMLA instructions for machine learning (ML) acceleration. These hardware features are enabled starting with Linux Kernel version 5.10, so it is highly recommended to use AMIs based on Linux Kernel 5.10 and beyond for the best LLM inference performance on Graviton instances. Use the following queries to list the AMIs with the recommended Kernel versions. New Ubuntu 22.04, 24.04, and AL2023 AMIs all have kernels newer than 5.10.
15 | 16 | The following steps were tested on a Graviton3 R7g.4xlarge and Ubuntu 24.04.1 17 | 18 | ``` 19 | # For Kernel 5.10 based AMIs list 20 | aws ec2 describe-images --owners amazon --filters "Name=architecture,Values=arm64" "Name=name,Values=*kernel-5.10*" --query 'sort_by(Images, &CreationDate)[].Name' 21 | 22 | # For Kernel 6.x based AMIs list 23 | aws ec2 describe-images --owners amazon --filters "Name=architecture,Values=arm64" "Name=name,Values=*kernel-6.*" --query 'sort_by(Images, &CreationDate)[].Name' 24 | ``` 25 | 26 | **Install Compiler and Python packages** 27 | ``` 28 | sudo apt-get update -y 29 | sudo apt-get install -y gcc-13 g++-13 libnuma-dev python3-dev python3-virtualenv 30 | ``` 31 | 32 | **Create a new Python environment** 33 | ``` 34 | virtualenv venv 35 | source venv/bin/activate 36 | ``` 37 | 38 | **Clone vLLM project** 39 | ``` 40 | git clone https://github.com/vllm-project/vllm.git 41 | cd vllm 42 | ``` 43 | 44 | **Install Python Packages and build vLLM CPU Backend** 45 | 46 | ``` 47 | pip install --upgrade pip 48 | pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy 49 | pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu 50 | 51 | VLLM_TARGET_DEVICE=cpu python setup.py install 52 | ``` 53 | 54 | **Run DeepSeek Inference on AWS Graviton** 55 | 56 | ``` 57 | export VLLM_CPU_KVCACHE_SPACE=40 58 | 59 | vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B 60 | 61 | curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Why is the sky blue?"}],"max_tokens": 100 }' 62 | ``` 63 | 64 | Sample output is as below. 
65 | 66 | ``` 67 | {"id":"chatcmpl-4c95b14ede764ab4a1338b0670ea839a","object":"chat.completion","created":1741351310,"model":"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"Okay, so I'm trying to understand why the sky appears blue. I've heard this phenomenon before, but I'm not exactly sure how it works. I think it has something to do with the mixing of light and particles in the atmosphere, but I'm not entirely clear on the details. Let me try to break it down step by step.\n\nFirst, there are a few factors contributing to why the sky looks blue. From what I remember, the atmosphere consists of gases and particles that absorb and refract light. Different parts of the sky are observed through different atmospheric layers, which might explain why the colors vary over long distances.\n\nI think the primary reason is that the atmosphere absorbs some of the red and blue light scattered from the sun. Red light has a longer wavelength compared to blue, so it is absorbed more easily because the molecules in the atmosphere absorb light based on its wavelength. Blue light has a shorter wavelength and doesn't get absorbed as much. As a result, the sky remains relatively blue because the blue light that hasn't been absorbed is still passing through the atmosphere and is refracted.\n\nAnother factor to consider is the angle of observation. 
Because the atmosphere travels through the sky, the observation of blue light can be messy at altitudes where the atmosphere is thinner.","tool_calls":[]},"logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":17,"total_tokens":273,"completion_tokens":256,"prompt_tokens_details":null},"prompt_logprobs":null} 68 | ``` 69 | 70 | # Additional Resources 71 | https://learn.arm.com/learning-paths/servers-and-cloud-computing/vllm/vllm-server/ 72 | https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html?device=arm 73 | -------------------------------------------------------------------------------- /nodejs.md: -------------------------------------------------------------------------------- 1 | # Node.js on Graviton 2 | 3 | Graviton is an excellent choice for running web applications with Node.js. There 4 | are a few considerations to be aware of to get the best performance. 5 | 6 | ## Use Multiprocessing 7 | 8 | Node.js is fundamentally single-threaded, so on an instance with more than 9 | one vCPU (which is most of them!), the node process will leave the CPU 10 | underutilized. There are a few ways to improve this. 11 | 12 | 1. Use a load balancer, such as Nginx, to balance incoming HTTP requests across 13 | multiple processes. 14 | 1. Use the built-in `cluster` module to balance the load across several forks of 15 | the node process. 16 | 17 | The details of how to do this are beyond the scope of this document, but can be 18 | easily found with a few quick searches of the web. 19 | 20 | ## Use Statically Linked Builds 21 | 22 | If you download compiled binaries of Node from nodejs.org, you will have 23 | statically linked binaries. Some package managers distribute Node as a thin 24 | `node` binary which is dynamically linked to `libnode.so` where most of the code 25 | lives.
This is fine and allows other applications to link with `libnode.so`, but 26 | it adds a small amount of extra overhead in each function call since each one 27 | must use an extra step of indirection to load the destination function address. 28 | This hardly matters at all until your application reaches a threshold volume of 29 | incoming requests and it can no longer service all requests coming in. In a 30 | dynamically linked `node`, this threshold will be lower. This is true on all 31 | EC2 instance types; it is not unique to Graviton. 32 | 33 | ## Applications Using Many and Complex Regular Expressions 34 | 35 | A shortcoming in the just-in-time compiler in V8 for aarch64 creates a long 36 | chain of veneers when evaluating complex regular expressions. A new version of 37 | V8 addresses this, but it has not yet been merged into Node.js main. If your 38 | application relies heavily on regular expression performance AND you find that 39 | the performance is lower on Graviton, try adding `--regexp-interpret-all` to 40 | the node arguments to force V8 to interpret rather than compile regular 41 | expressions. 42 | -------------------------------------------------------------------------------- /optimizing.md: -------------------------------------------------------------------------------- 1 | # Optimizing for Graviton 2 | 3 | ## Debugging Problems 4 | 5 | It's possible that incorrect code will work fine on an existing system, but 6 | produce an incorrect result when using a new compiler. This could be because 7 | it relies on undefined behavior in the language (e.g. assuming char is signed in C/C++, 8 | or the behavior of signed integer overflow), contains memory management bugs that 9 | happen to be exposed by aggressive compiler optimizations, or relies on incorrect memory ordering. 10 | Below are some techniques / tools we have used to find issues 11 | while migrating our internal services to newer compilers and Graviton based instances.
12 | 13 | ### Using Sanitizers 14 | The compiler may generate code and lay out data slightly differently on Graviton 15 | compared to an x86 system, and this could expose latent memory bugs that were previously 16 | hidden. On GCC, the easiest way to look for these bugs is to compile with the 17 | address and undefined behavior sanitizers by adding the below to your standard compiler flags: 18 | 19 | ``` 20 | CFLAGS += -fsanitize=address -fsanitize=undefined 21 | LDFLAGS += -fsanitize=address -fsanitize=undefined 22 | ``` 23 | 24 | Then run the resulting binary; any bugs detected by the sanitizers will cause 25 | the program to exit immediately and print helpful stack traces and other 26 | information. 27 | 28 | ### Ordering issues 29 | Arm is weakly ordered, similar to POWER and other modern architectures, while 30 | x86 is a variant of total store ordering (TSO). 31 | Code that relies on TSO may lack barriers to properly order memory references. 32 | Armv8-based systems, including all Gravitons, are [weakly ordered 33 | multi-copy-atomic](https://www.cl.cam.ac.uk/~pes20/armv8-mca/armv8-mca-draft.pdf). 34 | 35 | While TSO allows reads to occur out-of-order with writes and a processor to 36 | observe its own write before it is visible to others, the Armv8 memory model has 37 | further relaxations for performance and power efficiency. 38 | Code relying on pthread mutexes or locking abstractions 39 | found in C++, Java or other languages shouldn't notice any difference. Code that 40 | has a bespoke implementation of lockless data structures or implements its own 41 | synchronization primitives will have to use the proper intrinsics and 42 | barriers to correctly order memory transactions. If you run into an issue with 43 | memory ordering please feel free to open an issue in this GitHub repo, and one 44 | of our AWS experts will contact you. 45 | 46 | ### Architecture specific optimization 47 | Sometimes code will have architecture specific optimizations.
These can take many forms: 48 | sometimes the code is optimized in assembly using specific instructions for 49 | [CRC](https://github.com/php/php-src/commit/2a535a9707c89502df8bc0bd785f2e9192929422), 50 | other times the code could be enabling a [feature](https://github.com/lz4/lz4/commit/605d811e6cc94736dd609c644404dd24c013fd6f) 51 | that has been shown to work well on particular architectures. A quick way to see if any optimizations 52 | are missing for Arm is to grep the code for `__x86_64__` `ifdef`s and see if there 53 | is corresponding Arm code there too. If not, that might be something to improve. 54 | We welcome suggestions by opening an issue in this repo. 55 | 56 | ### Lock/Synchronization intensive workload 57 | Graviton2 processors and later support the Arm Large System Extensions (LSE). LSE based locking and synchronization 58 | is an order of magnitude faster for highly contended locks with high core counts (e.g. up to 192 cores on Graviton4). 59 | For workloads that have highly contended locks, compiling with `-march=armv8.2-a` will enable LSE based atomics and can substantially increase performance. However, this will prevent the code 60 | from running on an Arm v8.0 system such as AWS Graviton-based EC2 A1 instances. 61 | With GCC 10 and newer, the option `-moutline-atomics` will not inline atomics and instead 62 | detect at run time the correct type of atomic to use. This is slightly worse 63 | performing than `-march=armv8.2-a` but does retain backwards compatibility. 64 | 65 | ### Network intensive workloads 66 | In some workloads, the packet processing capability of Graviton is both faster and 67 | lower-latency than on other platforms, which reduces the natural “coalescing” 68 | capability of the Linux kernel and increases the interrupt rate. 69 | Depending on the workload it might make sense to enable adaptive RX interrupts 70 | (e.g. `ethtool -C adaptive-rx on`).
71 | 72 | ## Profiling the code 73 | If you aren't getting the performance you expect, one of the best ways to understand what is 74 | going on in the system is to compare profiles of execution and understand where the CPUs are 75 | spending time. This will frequently point to a hot function or sub-system that could be optimized. A useful shortcut 76 | is comparing a profile between a system that is performing well and one that isn't to see the 77 | relative difference in execution time. Feel free to open an issue in this 78 | GitHub repo for advice or help. 79 | 80 | Using the [AWS APerf](https://github.com/aws/aperf) tool: 81 | ```bash 82 | # Graviton 83 | wget -qO- https://github.com/aws/aperf/releases/download/v0.1.10-alpha/aperf-v0.1.10-alpha-aarch64.tar.gz | tar -xvz -C /target/directory 84 | 85 | # x86 86 | wget -qO- https://github.com/aws/aperf/releases/download/v0.1.10-alpha/aperf-v0.1.10-alpha-x86_64.tar.gz | tar -xvz -C /target/directory 87 | 88 | ## Record a profile and generate a report 89 | cd /target/directory/ 90 | ./aperf record -r -i -p 91 | ./aperf report -r -n 92 | 93 | ## The resulting report can be viewed with a web-browser by opening the index.html file 94 | ``` 95 | 96 | 97 | Using the Linux perf tool: 98 | ```bash 99 | # Amazon Linux 2 100 | sudo yum install perf 101 | 102 | # Ubuntu 103 | sudo apt-get install linux-tools-$(uname -r) 104 | ``` 105 | 106 | Record a profile: 107 | ``` 108 | # If the program is run interactively 109 | $ sudo perf record -g -F99 -o perf.data ./your_program 110 | 111 | # If the program is a service, sample all cpus (-a) and run for 60 seconds while the system is loaded 112 | $ sudo perf record -ag -F99 -o perf.data sleep 60 113 | ``` 114 | 115 | Look at the profile: 116 | ``` 117 | $ perf report 118 | ``` 119 | 120 | Additionally, there is a tool that will generate a visual representation of the output which can sometimes 121 | be more useful: 122 | ``` 123 | git clone https://github.com/brendangregg/FlameGraph.git 124 | 
perf script -i perf.data | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl > flamegraph.svg 125 | ``` 126 | 127 | For example, in March 2020, we committed a patch to 128 | [ffmpeg](http://ffmpeg.org/pipermail/ffmpeg-devel/2019-November/253385.html) to 129 | improve performance. Comparing the execution time of a C5 vs an M6g 130 | immediately uncovered an outlier function `ff_hscale_8_to_15_neon`. Once we 131 | identified this as the outlier we could focus on improving this function. 132 | 133 | ``` 134 | C5.4XL M6g.4XL 135 | 19.89% dv_encode_video_segment 19.57% ff_hscale_8_to_15_neon 136 | 11.21% decode_significance_x86 18.02% get_cabac 137 | 8.68% get_cabac 15.08% dv_encode_video_segment 138 | 8.43% ff_h264_decode_mb_cabac 5.85% ff_jpeg_fdct_islow_8 139 | 8.05% ff_hscale8to15_X4_ssse3 5.01% ff_yuv2planeX_8_neon 140 | ``` -------------------------------------------------------------------------------- /os.md: -------------------------------------------------------------------------------- 1 | # Operating Systems available for Graviton based instances 2 | 3 | Name | Version | [LSE Support](optimizing.md#locksynchronization-intensive-workload) | Kernel page size | AMI | Metal support | Comment 4 | ------ | ------ | ----- | ----- | ----- | ----- | ----- 5 | Amazon Linux 2023 | All versions | Yes | 4KB | [AMIs](amis_cf_sm.md) | Yes | Pointer Authentication enabled on Graviton3 6 | Amazon Linux 2 | 2.26-35 or later| Yes | 4KB | [AMIs](amis_cf_sm.md) | Yes | End of Life (EOL) scheduled 2025-06-30 7 | Ubuntu Pro | 22.04 LTS | Yes | 4KB | [MarketPlace](https://aws.amazon.com/marketplace/pp/prodview-uy7jg4dds3qjw) | Yes | 8 | Ubuntu | 24.04 LTS | Yes | 4KB | [noble](https://cloud-images.ubuntu.com/locator/ec2/) | Yes | 9 | Ubuntu | 22.04 LTS | Yes | 4KB | [jammy](https://cloud-images.ubuntu.com/locator/ec2/) | Yes | 10 | Ubuntu | 20.04 LTS | Yes | 4KB | [focal](https://cloud-images.ubuntu.com/locator/ec2/) | Yes | 11 | Ubuntu | 18.04 LTS | Yes (*) | 4KB | 
[bionic](https://cloud-images.ubuntu.com/locator/ec2/) | Yes | (*) needs `apt install libc6-lse`. Free support ended 2023/05/31. 12 | SuSE | 15 SP2 or later| Planned | 4KB | [MarketPlace](https://aws.amazon.com/marketplace/pp/B07SPTXBDX) | Yes | 13 | Redhat Enterprise Linux | 8.2 or later | Yes | 64KB | [MarketPlace](https://aws.amazon.com/marketplace/pp/B07T2NH46P) | Yes | 14 | ~~Redhat Enterprise Linux~~ | ~~7.x~~ | ~~No~~ | ~~64KB~~ | ~~[MarketPlace](https://aws.amazon.com/marketplace/pp/B07KTFV2S8)~~ | | Supported on A1 instances but not on Graviton2 and later based ones 15 | AlmaLinux | 8.4 or later | Yes | 64KB | [AMIs](https://wiki.almalinux.org/cloud/AWS.html) | Yes | 16 | Alpine Linux | 3.12.7 or later | Yes (*) | 4KB | [AMIs](https://www.alpinelinux.org/cloud/) | | (*) LSE enablement checked in version 3.14 | 17 | CentOS | 8.2.2004 or later | No | 64KB | [AMIs](https://wiki.centos.org/Cloud/AWS#Images) | Yes | | 18 | CentOS Stream | 8 | No (*) | 64KB (*) | [Downloads](https://www.centos.org/centos-stream/) | |(*) details to be confirmed once AMI's are available| 19 | ~~CentOS~~ | ~~7.x~~ | ~~No~~ | ~~64KB~~ | ~~[AMIs](https://wiki.centos.org/Cloud/AWS#Images)~~ | | Supported on A1 instances but not on Graviton2 and later based ones 20 | Debian | 12 | Yes | 4KB | [Community](https://wiki.debian.org/Cloud/AmazonEC2Image/Bookworm) or [MarketPlace](https://aws.amazon.com/marketplace/pp/prodview-63gms6fbfaota) | Yes | 21 | Debian | 11 | Yes | 4KB | [Community](https://wiki.debian.org/Cloud/AmazonEC2Image/Bullseye) or [MarketPlace](https://aws.amazon.com/marketplace/pp/prodview-jwzxq55gno4p4) | Yes | 22 | Debian | 10 | [Planned for Debian 11](https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=956418) | 4KB | [Community](https://wiki.debian.org/Cloud/AmazonEC2Image/Buster) or [MarketPlace](https://aws.amazon.com/marketplace/pp/B085HGTX5J) | Yes, as of Debian 10.7 (2020-12-07) | 23 | FreeBSD | 12.1 or later | No | 4KB | 
[Community](https://www.freebsd.org/releases/12.1R/announce.html) or [MarketPlace](https://aws.amazon.com/marketplace/pp/B081NF7BY7) | No | Device hotplug and API shutdown don't work 24 | FreeBSD | 13.0 or later | Yes | 4KB | [Community](https://www.freebsd.org/releases/13.0R/announce.html) or [MarketPlace](https://aws.amazon.com/marketplace/pp/B09291VW11) | No | Device hotplug and API shutdown don't work 25 | FreeBSD | 14.1 or later | Yes | 4KB | [Community](https://www.freebsd.org/releases/14.1R/announce/) or [MarketPlace](https://aws.amazon.com/marketplace/pp/prodview-axdyrrhr6pboq) | Yes | 26 | Flatcar Linux | 3033.2.0 or later | Yes | 4KB | [AMIs](https://www.flatcar.org/docs/latest/installing/cloud/aws-ec2/) or [marketplace](https://aws.amazon.com/marketplace/pp/prodview-zmao5idgwafbi) | Yes | | 27 | Rocky Linux | 8.4 or later | Yes (*) | 64KB (*) | [ISOs](https://rockylinux.org/download) | | [Release Notes](https://docs.rockylinux.org/release_notes/8-changelog/)
(*) details to be confirmed once AMI's are available 28 | 29 | 30 | OS Name | Minimum recommended Linux kernel version for Graviton 31 | ------ | ------ 32 | Amazon Linux 2023 | All supported kernels 33 | Amazon Linux 2 | >= 4.14.273-207.502.amzn2, >= 5.4.186-102.354.amzn2, or >= 5.10.106-102.504.amzn2 34 | Ubuntu 24.04 | All supported kernels 35 | Ubuntu 22.04 | All supported kernels 36 | Ubuntu 20.04 | >= 5.4.0-1047-aws, >= 5.8.0-1034-aws, >= 5.11.0-1009-aws 37 | Ubuntu 18.04 | >= 4.15.0-1101-aws, >= 5.4.0-1047-aws 38 | Redhat Enterprise Linux 8 | >= 4.18.0-305.10 39 | 40 | # Operating systems which do not support Graviton based instances 41 | 42 | Name | Version | 43 | ------ | ------ | 44 | Microsoft Windows | All versions 45 | -------------------------------------------------------------------------------- /perfrunbook/README.md: -------------------------------------------------------------------------------- 1 | # Graviton Performance Runbook 2 | 3 | ## Introduction 4 | 5 | This document is a reference for software developers who want to benchmark, debug, and optimize their application code on AWS Graviton based instances. It contains checklists, best practices, examples, and tooling collected by the EC2 Graviton team to assist with the tasks of benchmarking, debugging, or optimizing code on Graviton. 6 | 7 | This document covers many topics, including how to benchmark, how to debug performance, and which optimizations we recommend. It is not meant to be read beginning-to-end. Instead, view it as a collection of checklists and best known practices that go progressively deeper into analyzing the system, to apply when working with Graviton instances. Please see the FAQ below to direct you towards the most relevant set of checklists and tools depending on your specific situation.
8 | 9 | If after following these guides there is still an issue you cannot resolve with regard to performance on Graviton based instances, please do not hesitate to raise an issue on the [AWS-Graviton-Getting-Started](https://github.com/aws/aws-graviton-getting-started/issues) guide or contact us at [ec2-arm-dev-feedback@amazon.com](mailto:ec2-arm-dev-feedback@amazon.com). If there is something missing in this guide, please raise an issue or, better, post a pull-request. 10 | 11 | ## Pre-requisites 12 | 13 | To assist with the tasks the checklists in this runbook describe, we have created some helper-scripts. The helper-scripts assume the test instances are running an up-to-date AL2, AL2023 or Ubuntu 20.04LTS/22.04LTS distribution and the user can run the scripts using `sudo`. Follow the steps below to obtain and install the utilities on your test systems: 14 | 15 | ```bash 16 | # Clone the repository onto your systems-under-test and any load-generation instances 17 | git clone https://github.com/aws/aws-graviton-getting-started.git 18 | cd aws-graviton-getting-started/perfrunbook/utilities 19 | 20 | # On AL2 or Ubuntu distribution 21 | sudo ./install_perfrunbook_dependencies.sh 22 | 23 | # All scripts expect to run from the utilities directory 24 | ``` 25 | 26 | ## APerf for performance analysis 27 | 28 | There is also a new tool aimed at helping move workloads over to Graviton called [APerf](https://github.com/aws/aperf); it bundles many of the capabilities of the individual tools present in this 29 | runbook and provides a better presentation. It is highly recommended to download this tool and use it to gather most of the same information in one test-run. 31 | ## Sections 32 | 33 | 1. [Introduction to Benchmarking](./intro_to_benchmarking.md) 34 | 2. [Defining your benchmark](./defining_your_benchmark.md) 35 | 3. [Configuring your load generator](./configuring_your_loadgen.md) 36 | 4. 
[Configuring your system-under-test environment](./configuring_your_sut.md) 37 | 5. Debugging Performance 38 | 1. [Debugging performance — “What part of the system is slow?”](./debug_system_perf.md) 39 | 2. [Debugging performance — “What part of the code is slow?”](./debug_code_perf.md) 40 | 3. [Debugging performance — “What part of the hardware is slow?”](./debug_hw_perf.md) 41 | 6. [Optimizing performance](./optimization_recommendation.md) 42 | 7. [Appendix — Additional resources](./appendix.md) 43 | 8. [References](./references.md) 44 | 45 | ## FAQ 46 | 47 | * **I want to benchmark Graviton but I have yet to port my application, where do I find information on helping port my application?** 48 | Our getting-started-guide has many resources to help with porting code to Graviton for a number of programming languages. Start by reading those guides to understand minimum runtime, dependency and language requirements needed for Graviton. 49 | * **What benchmark should I run to determine if my application will be a good fit on Graviton?** 50 | No synthetic benchmark is a substitute for your actual production code. The best benchmark is running your production application on Graviton with a load that approximates your production load. Please refer to [Section 1](./intro_to_benchmarking.md) and [2](./defining_your_benchmark.md) on how to benchmark and on selecting a benchmark for more details. 51 | * **I ran micro-benchmark X from github project Y and it shows Graviton has worse performance, does that mean my application is not a good fit?** 52 | No. Benchmarks only tell a limited story about performance, and unless this particular benchmark has been vetted as a good indicator for your application’s performance, we recommend running your production code as its own benchmark. For more details, refer to [Section 1](./intro_to_benchmarking.md) and [2](./defining_your_benchmark.md) on how to define experiments and test Graviton for your needs. 
53 | * **I benchmarked my service and performance on Graviton is slower compared to my current x86 based fleet, where do I start to root cause why?** 54 | Begin by verifying software dependencies and verifying the configuration of your Graviton and x86 testing environments to check that no major differences are present in the testing environment. Performance differences may be due to differences in environment and not the due to the hardware. Refer to the below chart for a step-by-step flow through this runbook to help root cause the performance regression: 55 | ![](./images/performance_debug_flowchart.png) 56 | * **What are the recommended optimizations to try with Graviton?** 57 | Refer to [Section 6](./optimization_recommendation.md) for our recommendations on how to make your application run faster on Graviton. 58 | * **I investigated every optimization in this guide and still cannot find the root-cause, what do I do next?** 59 | Please contact us at [ec2-arm-dev-feedback@amazon.com](mailto:ec2-arm-dev-feedback@amazon.com) or talk with your AWS account team representative to get additional help. 60 | 61 | -------------------------------------------------------------------------------- /perfrunbook/configuring_your_loadgen.md: -------------------------------------------------------------------------------- 1 | # Configuring your load generator 2 | 3 | [Graviton Performance Runbook toplevel](./README.md) 4 | 5 | The load generator setup is important to understand and verify: it generates the load that is expected. An unknown load-generation setup can lead to not measuring the expected experiment and getting results that are hard to interpret. Below is a checklist to step through and verify the load generator is working as expected. 6 | 7 | 1. Ensure the load generator instance is large enough for driving traffic to the Systems-under-test (SUTs), we recommend using 12xl instances or larger. 8 | 2. 
When generating load, verify the load-generator instance is not using 100% CPU for load-generators that use blocking IO. For load-generators that busy poll, verify that it is spending ~50% of its time in the busy poll loop by utilizing `perf` to verify where the load generator code is spending time. See [Section 5.b](./debug_code_perf.md) on how to generate CPU time profiles to verify this. 9 | 3. If the load-generator is close to its limit, measurements taken may be measuring the load-generator's ability to generate load and not the SUT's ability to process that load. A load-generator that is spending less than 50% of its time generating load is a good target to ensure you are measuring the SUT. 10 | 4. When generating load, verify the load-generator is receiving valid responses and not a large number of errors from the SUT. For example, the example below shows the [Wrk2](https://github.com/giltene/wrk2) load generator receiving many errors, meaning the test point is invalid: 11 | ```bash 12 | Running 30s test @ http://127.0.0.1:80/index.html 13 | 2 threads and 100 connections 14 | Thread calibration: mean lat.: 9747 usec, rate sampling interval: 21 msec 15 | Thread calibration: mean lat.: 9631 usec, rate sampling interval: 21 msec 16 | Thread Stats Avg Stdev Max +/- Stdev 17 | Latency 6.46ms 1.93ms 12.34ms 67.66% 18 | Req/Sec 1.05k 1.12k 2.50k 64.84% 19 | 60017 requests in 30.01s, 19.81MB read 20 | Non-2xx or 3xx responses: 30131 21 | Requests/sec: 2000.15 22 | Transfer/sec: 676.14KB 23 | ``` 24 | 5. Check `dmesg` and logs in `/var/log` on the SUT and load generator for indications of errors occurring when under load 25 | 6. Verify load generators and SUTs are physically close to remove sensitivities to RTT (Round-Trip-Times) of messages. Differences in RTT from a load-generator to SUTs can show up as lower throughput, higher latency percentiles and/or increase in error rates, so it is important to control this aspect for testing. 
Check the following aspects in the order provided to verify your network setup follows best practices and reduces the influence of network factors on test results: 26 | 1. Verify on EC2 console *Placement Group* is filled in with the same placement for the SUTs and load generators and that placement group is set to `cluster`. 27 | 2. Check in the EC2 console that all load generators are in the same subnet (i.e. us-east-1a) 28 | 3. Run `ping` from your SUTs to the load generators and any back-end services your SUT will communicate with. Verify the average latencies are similar (10s of micro-seconds). You can also use `traceroute` to see the number of hops between your instances as well, ideally it should be 3 or less. 29 | 7. Check for ephemeral port exhaustion that may prevent load-generator from driving the desired traffic load: 30 | ```bash 31 | # Check on both the load generator/SUT machines 32 | %> netstat -nt | awk '/ESTABLISHED|TIME_WAIT|FIN_WAIT2|FIN_WAIT1|CLOSE_WAIT/ {print $6}' | sort | uniq -c | sort -n 33 | 30000 TIME_WAIT 34 | 128 ESTABLISHED 35 | 36 | # If large numbers of connections are in TIME_WAIT, and number of ESTABLISHED is decreasing, 37 | # port exhaustion is happening and can lead to decrease in driven load. Use the below tips to 38 | # fix the issue. 39 | 40 | # Increase ephemeral port range on load generator/SUT 41 | %> sudo sysctl -w net.ipv4.ip_local_port_range 1024 65535 42 | # If you application uses IPv6 43 | %> sudo sysctl -w net.ipv6.ip_local_port_range 1024 65535 44 | 45 | # Allow kernel to re-use connections in load generator/SUT 46 | %> sysctl -w net.ipv4.tcp_tw_reuse=1 47 | ``` 48 | 8. Check connection rates on SUT. Do you see constant rate of new connections or bursty behavior? Does it match the expectations for the workload? 
49 | ```bash 50 | # Terminal on load-generator instance 51 | %> 52 | 53 | # Terminal on SUT 54 | %> cd ~/aws-graviton-getting-started/perfrunbook/utilities 55 | %> python3 ./measure_and_plot_basic_sysstat_stats.py --stat new-connections --time 60 56 | ``` 57 | 9. If a higher than expected connection rate is being observed, the cause of these new connections can be determined by looking at a packet trace and determining which end is initiating and closing the connections. 58 | ```bash 59 | # On load-generator 60 | %> 61 | 62 | # On load-generator 63 | # Grab a tcpdump from the network device that will receive traffic, likely 64 | # eth0, but check your configuration. 65 | %> tcpdump -i eth0 -s 128 -w dump.pcap 66 | %> 67 | ``` 68 | 10. Open the trace using a tool like [Wireshark](https://www.wireshark.org/#download). 69 | 11. Look for which side is closing connections unexpectedly by looking for `Connection: Close` in the HTTP headers, or `FIN` in the TCP headers. Identify which system is doing this more than expected. 70 | 12. Verify the setup of the SUT and/or load generator for connection establishment behavior. 71 | 13. Check the packet rate: is it behaving as expected, i.e. a constant rate of traffic, or bursty? 72 | ```bash 73 | # Load generator terminal #1 74 | %> 75 | 76 | # Terminal #2 on load generator 77 | %> cd ~/aws-graviton-getting-started/perfrunbook/utilities 78 | %> python3 ./measure_and_plot_basic_sysstat_stats.py --stat tcp-out-segments --time 60 79 | %> python3 ./measure_and_plot_basic_sysstat_stats.py --stat tcp-in-segments --time 60 80 | ``` 81 | 14. Check for hot connections (i.e. connections that are more heavily used than others) by running `watch netstat -t`. The below example shows the use of `netstat -t` to watch multiple TCP connections. One connection is active and has a non-zero `Send-Q` value while all other connections have a `Send-Q` value of 0.
82 | ```bash 83 | %> watch netstat -t 84 | Every 2.0s: netstat -t 85 | ip-172-31-9-146: Tue Jan 12 23:01:35 2021 86 | 87 | Active Internet connections (w/o servers) 88 | Proto Recv-Q Send-Q Local Address Foreign Address State 89 | tcp 0 0 ip-172-31-9-146.ec2:ssh 72-21-196-67.amaz:62519 ESTABLISHED 90 | tcp 0 345958 ip-172-31-9-146.ec2:ssh 72-21-196-67.amaz:25884 ESTABLISHED 91 | tcp 0 0 ip-172-31-9-146.ec2:ssh 72-21-196-67.amaz:18144 ESTABLISHED 92 | ``` 93 | 15. Is the behavior expected? If not, re-visit load-generator configuration. 94 | -------------------------------------------------------------------------------- /perfrunbook/defining_your_benchmark.md: -------------------------------------------------------------------------------- 1 | # Defining your benchmark 2 | 3 | [Graviton Performance Runbook toplevel](./README.md) 4 | 5 | To define a benchmark there are two things to consider, the software running on the System-under-test (SUT) and how to drive load. We recommend the software running on the SUT should be your production application. There is no better benchmark to predict performance than the actual production code. If a synthetic proxy must be used to break dependencies of your application on external services such as authentication layers, then that proxy should be derived from the production code as much as possible. We recommend avoiding synthetic benchmarks not related to the production code. They are generally poor at predicting performance for another application or helping optimize it as they can over-target specific attributes of a system or exercise different bottlenecks than your application code might. 6 | 7 | Once the code to test is selected, the proper traffic load to the SUT must be defined. There are three primary ways to drive load to measure performance and also to debug performance issues: via a synthetic load generator, live traffic redirection, or live traffic duplication. 
After determining how to send traffic to your SUT, the next step is to determine the load point for the SUT. Picking the proper load point will help with evaluating and debugging your application code on Graviton. For most cases, the proper load point should be what is defined as peak load. Below we describe two methods to pick the proper load point: the maximum number of operations per second (maximum throughput), and breaking latency. 8 | 9 | ## Maximum throughput 10 | 11 | Maximum throughput tests the limits of the SUT, with no regard for response latency. To test maximum throughput, have the load-generator increase load until all vCPUs on the SUT are operating at 100% with as little OS idle time measured as possible. This can be verified by using `sysstat` or `htop` to see CPU utilization and by measuring which load-generator settings achieve the maximum score. If reaching 100% CPU utilization on the SUT is not possible, then find the peak throughput point reported with the load generator and record the settings used. If there are any bottlenecks present, they will be magnified when running at maximum throughput. After each optimization attempted, the maximum throughput point will have to be re-evaluated. 12 | 13 | ## Throughput at breaking latency 14 | 15 | If latency and throughput have to be balanced, then a different methodology is needed. In this case it is important to look for the breaking latency. Breaking latency is the point at which the machine can no longer serve more throughput while maintaining acceptable response times to the load-generator, and incrementally more throughput induces an exponential rise in latency. An example of that exponential rise is below.
16 | 17 | ![](images/example_breaking_latency_chart.png) 18 | 19 | To find breaking latency, configure the load-generator to incrementally increase load while measuring latency and/or response failures to find the knee point in the latency/failure/throughput curve before it starts an exponential rise indicating the saturation point of the SUT. This usually requires a binary or linear search to refine where this point is located for each SUT. Once these points are found, compare them and continue to use them for further experiments and debugging. After each successive optimization attempted, the saturation throughput point will have to be re-evaluated, and the goal is to shift the curve down and to the right, indicating more throughput and lower latency. 20 | 21 | 22 | ## Synthetic load generators 23 | 24 | If you choose to use a synthetic load generator to drive your application, but need help finding a good candidate to use, please see the [Appendix](./appendix.md) to learn about the types of load generators and what we recommend to use. 25 | 26 | With your benchmark fully defined, it is now time to move on to [Section 3](./configuring_your_loadgen.md), and start verifying the test environment and accounting for any variables present. 
27 | -------------------------------------------------------------------------------- /perfrunbook/images/example_breaking_latency_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/example_breaking_latency_chart.png -------------------------------------------------------------------------------- /perfrunbook/images/jmc_example_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/jmc_example_image.png -------------------------------------------------------------------------------- /perfrunbook/images/oncpu_example_flamgraph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/oncpu_example_flamgraph.png -------------------------------------------------------------------------------- /perfrunbook/images/performance_debug_flowchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/performance_debug_flowchart.png -------------------------------------------------------------------------------- /perfrunbook/images/system-load/c6i.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/system-load/c6i.png -------------------------------------------------------------------------------- /perfrunbook/images/system-load/c7g-compared-to-c6i.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/system-load/c7g-compared-to-c6i.png -------------------------------------------------------------------------------- /perfrunbook/images/system-load/c7g.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-graviton-getting-started/b3b7379942979a1d02c3ac68610c2507359781c9/perfrunbook/images/system-load/c7g.png -------------------------------------------------------------------------------- /perfrunbook/intro_to_benchmarking.md: -------------------------------------------------------------------------------- 1 | # Quick introduction to benchmarking 2 | 3 | [Graviton Performance Runbook toplevel](./README.md) 4 | 5 | When designing an experiment to benchmark Graviton based instances against another instance type, it is key to remember the below 2 guiding principles: 6 | 7 | 1. Always define a specific question to answer with your benchmark 8 | 2. Control your variables and unknowns within the benchmark environment 9 | 10 | ### Ask the right question 11 | 12 | The first bullet point is the most important; without a specific question to answer, any benchmarking data gathered will be hard to interpret and provide limited conclusions. For example, a poor question is this: 13 | 14 | > “How do Graviton instances compare to x86 based instances on Java?” 15 | 16 | This type of question is impossible to answer. The primary problem is that there are too many variables present in that simple question, for instance: What Java version? What application? What load? What is the metric of interest? What OS distribution? What software dependencies? 
Because the scope of the question is so vast, any data gathered and any conclusion made from it will be immediately questioned and discarded as simple outliers by outside reviewers, leading to a great deal of wasted effort. 17 | 18 | Instead, formulate the question as tightly as possible: 19 | 20 | > “How does a Graviton instance’s request throughput compare to current instances on my Java application at a P99 of 10ms for a mix of 60% GETS and 40% PUTS on Ubuntu 20.04LTS?” 21 | 22 | This question is narrower in scope and defines a specific experiment that can be answered conclusively without a mountain of data. Always ask specific questions so as to get specific answers and conclusions. 23 | 24 | ### Less moving parts 25 | 26 | The second bullet follows from the first. Once a specific question is posed for an experiment, it is required to account for variables in the experimental environment as much as possible. 27 | 28 | Variables for benchmarking an application can include: 29 | - OS distribution 30 | - Linux kernel version 31 | - software dependency versions 32 | - instance size used 33 | - network placement group configuration 34 | - application setup 35 | - background daemons 36 | - traffic profiles 37 | - load generator behavior 38 | - etc. 39 | 40 | The more variables that are controlled for, the more specific the questions that can be answered about the system. For example, if an instance is over-provisioned with disk and network bandwidth and it is known that this configuration will not pose a bottleneck, then experiments can be derived to test only the capability of the CPU and DRAM in the system. 41 | 42 | It is recommended before running an experiment to fully understand all the ways the environment can vary and determine how and if they can be controlled for. The above list can be used as a starting point.
Having a thorough understanding of all the variables present in an experiment will enable better analysis of results and reduce the number of experimental runs needed before settling on the final configurations that will enable performance debugging. 43 | 44 | Now that you have a specific question to answer, and have a basic understanding of the variables to control for, lets get started with defining how to test your application to assist with debugging performance in [Section 2.](./defining_your_benchmark.md) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /perfrunbook/optimization_recommendation.md: -------------------------------------------------------------------------------- 1 | # Optimizing performance 2 | 3 | [Graviton Performance Runbook toplevel](./README.md) 4 | 5 | This section describes multiple different optimization suggestions to try on Graviton based instances to attain higher performance for your service. Each sub-section defines some optimization recommendations that can help improve performance if you see a particular signature after measuring the performance using the previous checklists. 6 | 7 | ## Optimizing for large instruction footprint 8 | 9 | 1. On C/C++ applications, `-flto`, `-Os`, and [Feedback Directed Optimization](https://gcc.gnu.org/wiki/AutoFDO/Tutorial) can help with code layout using GCC. 10 | 2. On Java, `-XX:-TieredCompilation`, `-XX:ReservedCodeCacheSize` and `-XX:InitialCodeCacheSize` can be tuned to reduce the pressure the JIT places on the instruction footprint. The JDK defaults to setting up a 256MB region by default for the code-cache which over time can fill, become fragmented, and live code may become sparse. 11 | 1. We recommend setting the code cache initially to: `-XX:-TieredCompilation -XX:ReservedCodeCacheSize=64M -XX:InitialCodeCacheSize=64M` and then tuning the size up or down as required. 12 | 2. 
Experiment with setting `-XX:+TieredCompilation` to gain faster start-up time and better optimized code. 13 | 3. When tuning the JVM code cache, watch for `code cache full` error messages in the logs indicating that the cache has been set too small. A full code cache can lead to worse performance. 14 | 15 | ## Optimizing for high TLB miss rates 16 | 17 | A TLB (translation lookaside buffer) is a cache that holds recent virtual address to physical address translations for the CPU to use. Making sure this cache never misses can improve application performance. 18 | 19 | 1. Enable Transparent Huge Pages (THP) 20 | `echo always > /sys/kernel/mm/transparent_hugepage/enabled` -or- `echo madvise > /sys/kernel/mm/transparent_hugepage/enabled` 21 | 2. On Linux kernels >=6.9 Transparent Huge Pages (THP) have been extended with [Folios](https://lwn.net/Articles/937239/) that create 16kB and 64kB huge pages in addition to 2MB pages. This allows the Linux kernel to use huge pages in more places to increase performance by reducing TLB pressure. Each folio size can be set to `inherit` to follow the top-level THP setting, or set independently to `never`, `always`, or `madvise`. 22 | 1. To use 16kB pages: `echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-16kB/enabled` 23 | 2. To use 64kB pages: `echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/enabled` 24 | 3. To use 2MB pages: `echo inherit > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/enabled` 25 | 3. If your application can use pinned hugepages because it uses mmap directly, try reserving huge pages directly via the OS. This can be done in two ways: 26 | 1. At runtime: `sysctl -w vm.nr_hugepages=X` 27 | 2. At boot time by specifying on the kernel command line in `/etc/default/grub`: `hugepagesz=2M hugepages=512` 28 | 4.
For Java, hugepages can be used for both the code-heap and data-heap by adding the flags below to your JVM command line: 29 | 1. `-XX:+UseTransparentHugePages` when THP is set to at least `madvise` 30 | 2. `-XX:+UseLargePages` if you have pre-allocated huge pages through `sysctl` or the kernel command line. 31 | 32 | Using huge-pages should generally improve performance on all EC2 instance types, but there can be cases where using exclusively 33 | huge-pages may lead to performance degradation. Therefore, it is always recommended to fully test your application after enabling and/or 34 | allocating huge-pages. 35 | 36 | ## Porting and optimizing assembly routines 37 | 38 | 1. If you need to port an optimized routine that uses x86 vector instruction intrinsics to Graviton’s vector instructions (called NEON instructions), you can use the [SSE2NEON](https://github.com/DLTcollab/sse2neon) library to assist in the porting. While SSE2NEON won’t produce optimal code, it generally gets close enough to reduce the performance penalty of not using the vector intrinsics. 39 | 2. For additional information on the vector instructions used on Graviton: 40 | 1. [Arm intrinsics guide](https://developer.arm.com/architectures/instruction-sets/intrinsics/) 41 | 2. [Graviton2 core software optimization guide](https://developer.arm.com/documentation/pjdoc466751330-9707/2-0) 42 | 3. [Graviton3 core software optimization guide](https://developer.arm.com/documentation/pjdoc466751330-9685/latest/) 43 | 4. [Graviton4 core software optimization guide](https://developer.arm.com/documentation/PJDOC-466751330-593177/latest/) 44 | 45 | ## Optimizing synchronization-heavy workloads 46 | 47 | 1. Look for specialized back-off routines for custom locks tuned using the x86 `PAUSE` instruction or the equivalent x86 `rep; nop` sequence.
Graviton2 should use a single `ISB` instruction as a drop-in replacement; for an example and explanation see this commit to the [Wired Tiger storage layer](https://github.com/wiredtiger/wiredtiger/pull/6080/files#diff-08a92383c3904f531b067c488d6d6e34ddad0e3008313982b1b0712c0c3a7598). 48 | 2. If a locking routine tries to acquire a lock in a fast path before forcing the thread to sleep via the OS to wait, try experimenting with attempting the fast path a few additional times before falling through to the slow path. [An example of this from the Finagle code-base where on Graviton2 we will spin longer for a lock before sleeping](https://github.com/twitter/finagle/blob/develop/finagle-stats-core/src/main/scala/com/twitter/finagle/stats/NonReentrantReadWriteLock.scala). 49 | 3. If you do not intend to run your application on Graviton1, try compiling your code on GCC using `-march=armv8.2-a` instead of using `-moutline-atomics` to reduce the overhead of using synchronization builtins. 50 | 51 | ## Network-heavy workload optimizations 52 | 53 | 1. Check ENA device tunings with `ethtool -c ethN` where `N` is the device number and check the `Adaptive RX` setting. By default on instances without extra ENIs this will be `eth0`. 54 | 1. Set `ethtool -C ethN adaptive-rx off` for a latency-sensitive workload 55 | 2. ENA tunings via `ethtool` can be made permanent by editing the `/etc/sysconfig/network-scripts/ifcfg-ethN` files. 56 | 2. Stop `irqbalance` from dynamically moving IRQ processing between vCPUs and set dedicated cores to process each IRQ. Example script below: 57 | ```bash 58 | # Assign eth0 ENA interrupts to the first N-1 cores 59 | systemctl stop irqbalance 60 | 61 | irqs=$(grep "eth0-Tx-Rx" /proc/interrupts | awk -F':' '{print $1}') 62 | cpu=0 63 | for i in $irqs; do 64 | echo $cpu > /proc/irq/$i/smp_affinity_list 65 | let cpu=${cpu}+1 66 | done 67 | ``` 68 | 3. Disable Receive Packet Steering (RPS) to avoid contention and extra IPIs. 69 | 1.
`cat /sys/class/net/ethN/queues/rx-N/rps_cpus` and verify they are set to `0`. In general RPS is not needed on Graviton2 and newer. 70 | 2. You can try using RPS if your situation is unique. Read the [documentation on RPS](https://www.kernel.org/doc/Documentation/networking/scaling.txt) to understand further how it might help. Also refer to [Optimizing network intensive workloads on Amazon EC2 A1 Instances](https://aws.amazon.com/blogs/compute/optimizing-network-intensive-workloads-on-amazon-ec2-a1-instances/) for concrete examples. 71 | 72 | ## Metal instance IO optimizations 73 | 74 | 1. On Graviton2 and newer metal instances, try disabling the System MMU (SMMU) to speed up IO handling: 75 | ```bash 76 | %> cd ~/aws-graviton-getting-started/perfrunbook/utilities 77 | # Configure the SMMU to be off on metal, which is the default on x86. 78 | # Leave the SMMU on if you require the additional security protections it offers. 79 | # Virtualized instances do not expose an SMMU to instances.
80 | %> sudo ./configure_graviton_metal_iommu.sh off 81 | %> sudo shutdown now -r 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /perfrunbook/references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | [Graviton Performance Runbook toplevel](./README.md) 4 | 5 | Experimental design: 6 | 7 | * [Your load generator is probably lying to you](http://highscalability.com/blog/2015/10/5/your-load-generator-is-probably-lying-to-you-take-the-red-pi.html) 8 | * [NIST Engineering Statistics: Choosing an experimental design](https://www.itl.nist.gov/div898/handbook/pri/section3/pri3.htm) 9 | 10 | Performance measurement tools: 11 | 12 | * [Top-down performance analysis](https://drive.google.com/file/d/0B_SDNxjh2Wbcc0lWemFNSGMzLTA/view) 13 | * [Brendan Gregg's homepage](http://www.brendangregg.com/overview.html) 14 | * [Flamegraph homepage](https://github.com/brendangregg/FlameGraph) 15 | * https://github.com/andikleen/pmu-tools 16 | * [Netstat man-page](https://linux.die.net/man/8/netstat) 17 | * [Sar man-page](https://linux.die.net/man/1/sar) 18 | * [perf-stat man-page](https://linux.die.net/man/1/perf-stat) 19 | * [perf-record man-page](https://linux.die.net/man/1/perf-record) 20 | * [perf-annotate man-page](https://linux.die.net/man/1/perf-annotate) 21 | 22 | Optimization and tuning: 23 | 24 | * [GCC10 manual](https://gcc.gnu.org/onlinedocs/gcc-10.2.0/gcc.pdf) 25 | * [Optimizing network intensive workloads on Graviton1](https://aws.amazon.com/blogs/compute/optimizing-network-intensive-workloads-on-amazon-ec2-a1-instances/) 26 | * [Optimizing NGINX on Graviton1](https://aws.amazon.com/blogs/compute/optimizing-nginx-load-balancing-on-amazon-ec2-a1-instances/) 27 | * [sysctl tunings](https://github.com/amazonlinux/autotune/blob/master/src/ec2sys_autotune/ec2_instance_network_cfg_gen.py#L63-L64) 28 | * [AL2 auto-tuning](https://github.com/amazonlinux/autotune) 29 | 
30 | Hardware reference manuals: 31 | 32 | * [Arm64 Architecture Reference Manual](https://developer.arm.com/documentation/102105/latest) 33 | * [Neoverse N1 Technical Reference Manual](https://developer.arm.com/documentation/100616/0400/debug-descriptions/performance-monitor-unit/pmu-events) 34 | * Reference for Intel CPU PMU counters (c5/m5/r5): [Intel 64 and IA-32 Architecture Software Developer’s Manual, Volume 3B Chapter 19](https://software.intel.com/content/dam/develop/external/us/en/documents-tps/253669-sdm-vol-3b.pdf) 35 | * Reference for AMD CPU PMU counters (c5a): [Processor Programming Reference (PPR) for AMD Family 17h Model 71h, Revision B0 Processors Section 2.1.15](https://www.amd.com/system/files/TechDocs/56176_ppr_Family_17h_Model_71h_B0_pub_Rev_3.06.zip) 36 | 37 | -------------------------------------------------------------------------------- /perfrunbook/system-load-and-compute-headroom.md: -------------------------------------------------------------------------------- 1 | # System Load and Compute Headroom 2 | 3 | Ideally, a system's performance should scale linearly as more vCPUs are used, up to 100% CPU load. 4 | This ideal is not always realized due to how vCPUs are implemented on x86-based and Graviton-based EC2 instances. 5 | x86-based instances in AWS EC2 execute two hardware threads per core, each thread representing a vCPU. 6 | Graviton processors execute only one hardware thread, or vCPU, per physical core. 7 | The Linux kernel schedules tasks onto idle physical cores first; only when all physical cores are in use does it schedule onto their sibling hardware threads. 8 | When hardware threads have to share one physical core, the per-thread performance decreases significantly. 9 | This leads x86 and Graviton systems to scale differently under increasing CPU load.
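This scheduling behavior can be illustrated with a toy model (illustrative only; the 1.3x per-core SMT speedup below is an assumed value, not a measurement):

```python
def modeled_throughput(busy_vcpus, physical_cores, smt=1, smt_speedup=1.3):
    """Toy model of aggregate throughput as vCPUs become busy.

    Assumes the scheduler fills idle physical cores first. Once both
    hardware threads of a core are busy, that core delivers only
    smt_speedup times a single thread's throughput (1.3x is an
    illustrative assumption, not a measured value).
    """
    if smt == 1:
        return float(busy_vcpus)  # one vCPU per core: linear scaling
    shared = max(0, busy_vcpus - physical_cores)  # cores running two threads
    alone = busy_vcpus - 2 * shared               # cores running one thread
    return alone + shared * smt_speedup


# 8 vCPUs backed by 8 cores (Graviton-style) vs 4 SMT cores (x86-style)
graviton_like = [modeled_throughput(n, 8) for n in range(9)]
x86_like = [modeled_throughput(n, 4, smt=2) for n in range(9)]
```

In this model the non-SMT curve stays linear out to 8 busy vCPUs, while the SMT curve bends after 4 busy vCPUs, once sibling threads start sharing cores.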
10 | We recommend always load testing your application on Graviton to determine the CPU load limits your application 11 | needs to maintain for quality of service, fail-over resilience, etc., as they will likely be different between Graviton and x86. 12 | To illustrate how Graviton behaves differently relative to CPU load, we provide two short case-studies below. 13 | 14 | ### Experimental Setup 15 | 16 | The test systems are limited to eight cores to limit the maximum packet load needed to saturate them. 17 | This is to exclude the influence of the networking infrastructure on the experiment. 18 | All systems run Amazon Linux 2 with Linux kernel 5.10.118. 19 | The test systems are virtualized instances that use EBS networked storage. 20 | 21 | #### First experiment 22 | 23 | The systems to be tested are an 8 vCPU Intel instance, c6i.2xlarge, and an 8 vCPU Graviton 3 instance, c7g.2xlarge. 24 | OpenSSL source is placed and configured in eight separate directories. 25 | The test then compiles OpenSSL one to eight times in parallel, limited to one vCPU per compile. 26 | An ideal system would take the exact same time for each compile run. 27 | 28 | Results: 29 | 30 | The Graviton 3 retains 96% of the single-compile, single-CPU baseline performance when using 100% of the available CPUs. 31 | The Intel system retains 63% of the single-compile, single-CPU baseline performance when loaded to 100%. 32 | 33 | ![](images/system-load/c7g-compared-to-c6i.png) 34 | 35 | Note: 36 | The average compile time of the c6i instance is 268s, close to the 250s it took on c7g. 37 | The Linux scheduler picks cores over threads when available, which shows nicely in the steep increase in compile time of the c6i beyond 50% load. 38 | For the c6i to maintain the same response time as the c7g, it would need more vCPUs. It can stay on par until 5 of the 8 vCPUs are used.
39 | 40 | #### Second experiment 41 | 42 | The second experiment adds a 64-core Graviton 3, c7g.16xlarge, that serves as a load generator running wrk2. 43 | Test systems are an 8 vCPU c7g.2xlarge Graviton 3 and an 8 vCPU c6i.2xlarge Intel Xeon. 44 | Here a simple HTTP server, implemented using the Netty/Java framework, is the workload on the systems under test. 45 | Latency vs. actual packets processed is captured alongside the CPU load of the systems, as reported by /proc/stat. 46 | Of particular interest is the performance at 50% and at maximum sustained load. 47 | Maximum sustained load is where the response latency is no longer than two times the response time at <20% CPU load. 48 | This is the reason why the plots stop shy of 100%. 49 | 50 | Results: 51 | 52 | ![](images/system-load/c7g.png) 53 | 54 | For this workload, the non-SMT Graviton 3 system performs better than linear: 55 | 644506 packets/s would be expected as maximum throughput but 736860 packets/s were handled, thus 14% better than expected. 56 | 57 | 58 | ![](images/system-load/c6i.png) 59 | 60 | The Intel SMT system degrades when loaded close to 100%. It achieves 86% of the performance it should have shown, based on the 50% number: 690276 packets/s would be expected, but 591411 packets/s were actually handled. 61 | At the 50% CPU load mark, a c6i can handle an additional 71% of the traffic it did up to 50% (345k, 246k), whereas the c7g is able to serve another 130% of that (314k, 423k). 62 | The c6i would need 2 additional vCPUs to be on par with the c7g in packets/s. 63 | 64 | #### Conclusion 65 | 66 | Graviton instances' compute performance increases near-linearly with CPU load, while x86 performance increases less after 50% CPU load. This is because our x86-based EC2 instances employ simultaneous multithreading (SMT), a.k.a. 'Hyper-Threading'.
Based on the above, load balancer thresholds can, in many cases, be set higher on Graviton instances than on x86-type instances and thus lead to significant savings in the size of the required server fleet, as the Netty example shows. 67 | Since every workload has different demands on the system, a full load sweep should be done to determine the best system type and the threshold at which additional instances need to be added to maintain performance. 68 | 69 | -------------------------------------------------------------------------------- /perfrunbook/utilities/capture_flamegraphs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) 5 | 6 | let capture_freq=99 7 | reverse= 8 | 9 | # search replace filter that will combine thread names like 10 | # GC-Thread-1 to GC-Thread- in perf-script sample header lines. 11 | sr_filter='s/^([a-zA-Z\-]+)[0-9]*-?([a-zA-Z]*) (.*?)$/\1\2 \3/g' 12 | 13 | help_msg() { 14 | echo "Requires perf to be installed and in your PATH" 15 | echo "usage: $0 oncpu|offcpu|custom_event [time seconds|default 300] -f|--frequency 99 -R|--Reverse -r|--regexfilter 's/hi/bye/g'" 16 | echo "custom_event is any event listed in perf-list" 17 | } 18 | 19 | process_perf_data () { 20 | perf inject -j -i perf.data -o perf.data.jit 21 | perf script -f -i perf.data.jit > script.out 22 | if [[ !
-z "${sr_filter}" ]]; then 23 | perl -pi -e "${sr_filter}" script.out 24 | fi 25 | "$script_dir/FlameGraph/stackcollapse-perf.pl" --kernel --jit script.out > folded.out 26 | "$script_dir/FlameGraph/flamegraph.pl" ${reverse:+--reverse} --colors java folded.out > flamegraph_$1_$4_$3_$2.svg 27 | rm perf.data perf.data.jit script.out folded.out 28 | } 29 | 30 | capture_on_cpu () { 31 | perf record -a -g -k 1 -F${capture_freq} -e cpu-clock:pppH -- sleep $1 32 | process_perf_data "oncpu" $2 $3 $4 33 | } 34 | 35 | capture_off_cpu () { 36 | perf record -a -g -k 1 -F${capture_freq} -e sched:sched_switch -- sleep $1 37 | process_perf_data "offcpu" $2 $3 $4 38 | } 39 | 40 | capture_custom_event () { 41 | perf record -a -g -k 1 -F${capture_freq} -e $1 -- sleep $2 42 | process_perf_data $1 $3 $4 $5 43 | } 44 | 45 | if [[ $(id -u) -ne 0 ]]; then 46 | echo "Must be run with sudo privileges" 47 | exit 1 48 | fi 49 | 50 | # Test perf exists; test directly, since with set -e a failing command substitution would exit before a $? check 51 | if ! perf list &> /dev/null; then 52 | help_msg 53 | exit 1 54 | fi 55 | 56 | 57 | POSITIONAL=() 58 | while [[ $# -gt 0 ]] 59 | do 60 | key="$1" 61 | case $key in 62 | -f|--frequency) 63 | let capture_freq="$2" 64 | shift; shift 65 | ;; 66 | -r|--regexfilter) 67 | sr_filter="$2" 68 | shift; shift 69 | ;; 70 | -R|--Reverse) 71 | let reverse=1 72 | shift; 73 | ;; 74 | -h|--help) 75 | help_msg 76 | exit 1 77 | ;; 78 | *) 79 | POSITIONAL+=("$1") 80 | shift 81 | ;; 82 | esac 83 | done 84 | set -- "${POSITIONAL[@]}" 85 | 86 | if [[ $# -lt 1 ]]; then 87 | help_msg 88 | exit 1 89 | fi 90 | 91 | capture_time=300 92 | if [[ $# -gt 1 ]]; then 93 | capture_time=$2 94 | fi 95 | 96 | date=$(date "+%Y-%m-%d_%H:%M:%S") 97 | # Try to get meta-data using IMDSv2 98 | if token=$(curl --max-time 1 -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") ; then 99 | instance_type=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type) 100 |
instance_id=$(curl -s -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-id) 101 | else 102 | instance_type="unknown" 103 | instance_id="unknown" 104 | fi 105 | 106 | if [[ "$1" == "oncpu" ]]; then 107 | capture_on_cpu $capture_time $date $instance_type $instance_id 108 | elif [[ "$1" == "offcpu" ]]; then 109 | capture_off_cpu $capture_time $date $instance_type $instance_id 110 | else 111 | capture_custom_event $1 $capture_time $date $instance_type $instance_id 112 | fi 113 | -------------------------------------------------------------------------------- /perfrunbook/utilities/configure_graviton_metal_iommu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grub_loc=/etc/default/grub 4 | #grub_loc=test 5 | 6 | help_msg() { 7 | echo "usage: $0 on|off" 8 | } 9 | 10 | set -e 11 | 12 | enable_iommu() { 13 | perl -pi -e 's/GRUB_CMDLINE_LINUX="((.(?!iommu.strict=[01]))*)"$/GRUB_CMDLINE_LINUX="\1 iommu.strict=1"/g' $grub_loc 14 | perl -pi -e 's/GRUB_CMDLINE_LINUX="(.*?) iommu.strict=[01](.*?)"$/GRUB_CMDLINE_LINUX="\1\2 iommu.strict=1"/g' $grub_loc 15 | perl -pi -e 's/GRUB_CMDLINE_LINUX="((.(?!iommu.passthrough=[01]))*)"$/GRUB_CMDLINE_LINUX="\1 iommu.passthrough=0"/g' $grub_loc 16 | perl -pi -e 's/GRUB_CMDLINE_LINUX="(.*?) iommu.passthrough=[01](.*?)"$/GRUB_CMDLINE_LINUX="\1\2 iommu.passthrough=0"/g' $grub_loc 17 | } 18 | 19 | disable_iommu() { 20 | perl -pi -e 's/GRUB_CMDLINE_LINUX="((.(?!iommu.strict=[01]))*)"$/GRUB_CMDLINE_LINUX="\1 iommu.strict=0"/g' $grub_loc 21 | perl -pi -e 's/GRUB_CMDLINE_LINUX="(.*?) iommu.strict=[01](.*?)"$/GRUB_CMDLINE_LINUX="\1\2 iommu.strict=0"/g' $grub_loc 22 | perl -pi -e 's/GRUB_CMDLINE_LINUX="((.(?!iommu.passthrough=[01]))*)"$/GRUB_CMDLINE_LINUX="\1 iommu.passthrough=1"/g' $grub_loc 23 | perl -pi -e 's/GRUB_CMDLINE_LINUX="(.*?) 
iommu.passthrough=[01](.*?)"$/GRUB_CMDLINE_LINUX="\1\2 iommu.passthrough=1"/g' $grub_loc 24 | } 25 | 26 | update_grub() { 27 | if [[ -d /boot/grub2 ]]; then 28 | grub2-mkconfig -o /boot/grub2/grub.cfg 29 | else 30 | grub2-mkconfig -o /boot/grub/grub.cfg 31 | fi 32 | } 33 | 34 | if [[ $# -ne 1 ]]; then 35 | help_msg 36 | exit 1 37 | fi 38 | 39 | if [[ $(id -u) -ne 0 ]]; then 40 | echo "Must be run with sudo privileges" 41 | exit 1 42 | fi 43 | 44 | cp $grub_loc ${grub_loc}.bak 45 | case $1 in 46 | on) 47 | enable_iommu 48 | update_grub 49 | ;; 50 | off) 51 | disable_iommu 52 | update_grub 53 | ;; 54 | *) 55 | rm ${grub_loc}.bak 56 | help_msg 57 | exit 1 58 | esac 59 | 60 | echo "To make changes take effect run: %> sudo shutdown now -r" 61 | -------------------------------------------------------------------------------- /perfrunbook/utilities/configure_mem_size.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | grub_loc=/etc/default/grub 4 | #grub_loc=test 5 | 6 | help_msg() { 7 | echo "usage: $0 mem-size-in-GB" 8 | } 9 | 10 | set -e 11 | 12 | config_mem () { 13 | perl -pi -e 's/GRUB_CMDLINE_LINUX="((.(?!mem=[0-9]+[KMG]))*)"$/GRUB_CMDLINE_LINUX="\1 mem='"${1}"'G"/g' $grub_loc 14 | perl -pi -e 's/GRUB_CMDLINE_LINUX="(.*?)
mem=[0-9]+[KMG](.*?)"$/GRUB_CMDLINE_LINUX="\1\2 mem='"${1}"'G"/g' $grub_loc 15 | } 16 | 17 | update_grub() { 18 | if [[ -d /boot/grub2 ]]; then 19 | grub2-mkconfig -o /boot/grub2/grub.cfg 20 | else 21 | grub2-mkconfig -o /boot/grub/grub.cfg 22 | fi 23 | } 24 | 25 | if [[ $# -ne 1 ]]; then 26 | help_msg 27 | exit 1 28 | fi 29 | 30 | if [[ $(id -u) -ne 0 ]]; then 31 | echo "Must be run with sudo privileges" 32 | exit 1 33 | fi 34 | 35 | cp $grub_loc ${grub_loc}.bak 36 | 37 | config_mem $1 38 | update_grub 39 | 40 | echo "To make changes take effect run: %> sudo shutdown now -r" 41 | -------------------------------------------------------------------------------- /perfrunbook/utilities/configure_vcpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | help_msg() { 4 | echo "usage: $0 [cpus] [default=threads|cores]" 5 | } 6 | 7 | if [[ $# -lt 1 ]]; then 8 | help_msg 9 | exit 1 10 | fi 11 | 12 | if [[ $(id -u) -ne 0 ]]; then 13 | echo "Must be run with sudo privileges" 14 | exit 1 15 | fi 16 | 17 | if [[ "$2" == "threads" && $(uname -m) == "aarch64" ]]; then 18 | echo "Can only specify 'cores' for hotplugging cpus on Graviton" 19 | exit 1 20 | fi 21 | 22 | # Re-enable all CPUs then modify the list 23 | all_vcpus=$(lscpu -a -p=CPU | awk -F',' '$0 ~ /^[[:digit:]]+$/{print $1}') 24 | for i in $all_vcpus; do 25 | echo 1 > /sys/devices/system/cpu/cpu$i/online 26 | done 27 | 28 | # Parse lscpu output to select the subset of cores/threads to hotplug in/out 29 | case $2 in 30 | threads) 31 | let physical_cores=$1/2 32 | vcpus_on=$(lscpu -p=CPU,CORE | awk -F',' 'BEGIN{j='"${physical_cores}"'}$0 ~ /^[[:digit:]]+,[[:digit:]]+$/{if ($2 < j) {print $1}; i++}') 33 | vcpus_off=$(lscpu -p=CPU,CORE | awk -F',' 'BEGIN{j='"${physical_cores}"'}$0 ~ /^[[:digit:]]+,[[:digit:]]+$/{if ($2 >= j) {print $1}; i++}') 34 | ;; 35 | cores) 36 | let physical_cores=$1 37 | vcpus_on=$(lscpu -p=CPU,CORE | awk -F',' 'BEGIN{j='"${physical_cores}"'}$0 ~ /^[[:digit:]]+,[[:digit:]]+$/{if ($1 < j) {print $1}; i++}') 38 | vcpus_off=$(lscpu -p=CPU,CORE | awk -F',' 'BEGIN{j='"${physical_cores}"'}$0 ~ /^[[:digit:]]+,[[:digit:]]+$/{if ($1 >= j) {print $1}; i++}') 39 | ;; 40 | *) 41 | echo "Do not recognize option: $2" 42 | exit 1 43 | esac 44 |
45 | for i in $vcpus_on; do 46 | echo 1 > /sys/devices/system/cpu/cpu$i/online 47 | done 48 | 49 | for i in $vcpus_off; do 50 | echo 0 > /sys/devices/system/cpu/cpu$i/online 51 | done 52 | 53 | # Update Docker cpusets to reflect cpus that are hotplugged in/out so that 54 | # restarted containers see the proper number of CPUs 55 | FILE=/sys/fs/cgroup/cpuset/docker/cpuset.cpus 56 | if [ -f "$FILE" ]; then 57 | cp /sys/fs/cgroup/cpuset/cpuset.cpus /sys/fs/cgroup/cpuset/docker/cpuset.cpus 58 | fi 59 | -------------------------------------------------------------------------------- /perfrunbook/utilities/find_and_list_jar_with_so.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $(id -u) -ne 0 ]]; then 4 | echo "Must run with sudo privileges" 5 | exit 1 6 | fi 7 | 8 | set -e 9 | 10 | i=$(find / -name '*.jar' -type f -print) 11 | i=($i) 12 | 13 | mkdir -p /tmp/jar_dir 14 | pushd /tmp/jar_dir 15 | 16 | for jar in ${i[@]}; do 17 | cp $jar . 18 | jar_name=$(basename ${jar}) 19 | unzip -qq ${jar_name} 20 | find . -name '*.so' -type f -print 21 | rm -rf * 22 | done 23 | 24 | popd 25 | rm -rf /tmp/jar_dir 26 | -------------------------------------------------------------------------------- /perfrunbook/utilities/find_and_list_pylib_with_so.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $(id -u) -ne 0 ]]; then 4 | echo "Must run with sudo privileges" 5 | exit 1 6 | fi 7 | 8 | py_ver=$(python3 --version | awk '{print $2}' | awk -F'.' '{print $1"."$2}' | tr -d [:cntrl:]) 9 | if [[ $# -ge 1 ]]; then 10 | py_ver=$1 11 | fi 12 | 13 | 14 | i=$(find / -wholename "*python${py_ver}/site-packages" -type d -print) 15 | i=($i) 16 | 17 | cwd=$(pwd) 18 | 19 | for pylib in ${i[@]}; do 20 | cd ${pylib} 21 | find .
-name '*.so' -type f -print 22 | cd $cwd 23 | done 24 | 25 | cd $cwd 26 | -------------------------------------------------------------------------------- /perfrunbook/utilities/install_bcc_tools_al2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ $(id -u) -ne 0 ]]; then 4 | echo "Must run with sudo privileges" 5 | exit 1 6 | fi 7 | 8 | echo "------ INSTALLING BCC PERFORMANCE TOOLS ------" 9 | amazon-linux-extras enable BCC 10 | yum install -y -q perf kernel-devel-$(uname -r) bcc 11 | -------------------------------------------------------------------------------- /perfrunbook/utilities/install_perfrunbook_dependencies.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | install_al2023_dependencies () { 4 | echo "------ INSTALLING UTILITIES ------" 5 | dnf install -y -q vim unzip git 6 | 7 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 8 | dnf install -y -q sysstat htop hwloc tcpdump 9 | 10 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 11 | dnf install -y -q perf kernel-devel-$(uname -r) bcc 12 | 13 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 14 | dnf install -y -q python3 python3-pip 15 | python3 -m pip install --upgrade pip 16 | python3 -m pip install pandas numpy scipy matplotlib sh seaborn plotext 17 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 18 | 19 | echo "------ DONE ------" 20 | } 21 | install_al2_dependencies () { 22 | echo "------ INSTALLING UTILITIES ------" 23 | yum install -y -q vim unzip git 24 | 25 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 26 | yum install -y -q sysstat dstat htop hwloc tcpdump 27 | 28 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 29 | amazon-linux-extras enable BCC 30 | yum install -y -q perf kernel-devel-$(uname -r) bcc 31 | 32 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 33 |
yum install -y -q python3 python3-pip 34 | python3 -m pip install --upgrade pip 35 | python3 -m pip install pandas numpy scipy matplotlib sh seaborn plotext 36 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 37 | 38 | echo "------ DONE ------" 39 | } 40 | 41 | install_ubuntu2004_dependencies () { 42 | echo "------ INSTALLING UTILITIES ------" 43 | apt-get update 44 | apt-get install -y -q vim unzip git lsb-release grub2-common net-tools 45 | ln -s /usr/sbin/grub-mkconfig /usr/sbin/grub2-mkconfig 46 | 47 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 48 | apt-get install -y -q sysstat htop hwloc tcpdump dstat 49 | 50 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 51 | apt-get install -y -q linux-tools-$(uname -r) linux-headers-$(uname -r) linux-modules-extra-$(uname -r) bpfcc-tools 52 | 53 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 54 | apt-get install -y -q python3-dev python3-pip 55 | python3 -m pip install --upgrade pip 56 | python3 -m pip install pandas numpy scipy matplotlib sh seaborn plotext 57 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 58 | 59 | echo "------ DONE ------" 60 | } 61 | 62 | install_ubuntu2404_dependencies () { 63 | echo "------ INSTALLING UTILITIES ------" 64 | apt-get update 65 | apt-get install -y -q vim unzip git lsb-release grub2-common net-tools 66 | ln -s /usr/sbin/grub-mkconfig /usr/sbin/grub2-mkconfig 67 | 68 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 69 | apt-get install -y -q sysstat htop hwloc tcpdump dstat 70 | 71 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 72 | apt-get install -y -q linux-tools-$(uname -r) linux-headers-$(uname -r) linux-modules-extra-$(uname -r) bpfcc-tools 73 | 74 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 75 | apt-get install -y -q python3-dev python3-pip pipx 76 | pipx install pandas numpy scipy matplotlib sh seaborn plotext --include-deps 77 | pipx
ensurepath 78 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 79 | 80 | echo "------ DONE ------" 81 | } 82 | 83 | install_rhel_9_5_dependencies () { 84 | echo "------ INSTALLING UTILITIES ------" 85 | dnf install -y -q vim unzip git perl-open.noarch 86 | 87 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 88 | dnf install -y -q sysstat htop hwloc tcpdump 89 | 90 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 91 | dnf install -y -q perf kernel-devel-$(uname -r) bcc 92 | 93 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 94 | dnf install -y -q python3 python3-pip 95 | python3 -m pip install --upgrade pip 96 | python3 -m pip install pandas numpy scipy matplotlib sh seaborn plotext 97 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 98 | 99 | echo "------ DONE ------" 100 | } 101 | 102 | install_sles_15_sp6_dependencies() { 103 | echo "------ INSTALLING UTILITIES ------" 104 | zypper --quiet --non-interactive install vim unzip git 105 | 106 | echo "------ INSTALLING HIGH LEVEL PERFORMANCE TOOLS ------" 107 | zypper --quiet --non-interactive install sysstat htop hwloc tcpdump 108 | 109 | echo "------ INSTALLING LOW LEVEL PERFORMANCE TOOLS ------" 110 | # Install perf and related tools 111 | zypper --quiet --non-interactive install perf 112 | zypper --quiet --non-interactive install kernel-default-devel 113 | zypper --quiet --non-interactive install bcc-tools 114 | 115 | echo "------ INSTALL ANALYSIS TOOLS AND DEPENDENCIES ------" 116 | # Install Python and pip 117 | zypper --quiet --non-interactive install python3 python3-pip 118 | python3 -m pip install --upgrade pip 119 | python3 -m pip install pandas numpy scipy matplotlib sh seaborn plotext 120 | 121 | # Get FlameGraph tools 122 | git clone https://github.com/brendangregg/FlameGraph.git FlameGraph 123 | 124 | echo "------ DONE ------" 125 | } 126 | 127 | if [[ $(id -u) -ne 0 ]]; then 128 | echo "Must run with sudo privileges"
129 | exit 1 130 | fi 131 | 132 | os_name=$(cat /etc/os-release | grep "PRETTY_NAME" | awk -F"=" '{print $2}' | tr -d '[="=]' | tr -d [:cntrl:]) 133 | 134 | 135 | case "$os_name" in 136 | "Amazon Linux 2023"*) 137 | install_al2023_dependencies 138 | ;; 139 | "Amazon Linux 2") 140 | install_al2_dependencies 141 | ;; 142 | "Ubuntu 20.04"*) 143 | install_ubuntu2004_dependencies 144 | ;; 145 | "Ubuntu 22.04"*) 146 | install_ubuntu2004_dependencies 147 | ;; 148 | "Ubuntu 24.04"*) 149 | install_ubuntu2404_dependencies 150 | ;; 151 | "Red Hat Enterprise Linux 9.5 (Plow)") 152 | install_rhel_9_5_dependencies 153 | ;; 154 | "SUSE Linux Enterprise Server 15 SP6") 155 | install_sles_15_sp6_dependencies 156 | ;; 157 | *) 158 | echo "$os_name not supported" 159 | exit 1 160 | ;; 161 | esac 162 | 163 | echo "DEPENDENCIES SUCCESSFULLY INSTALLED! -- RUN UTILITIES FROM THIS DIRECTORY" 164 | -------------------------------------------------------------------------------- /perfrunbook/utilities/measure_and_plot_basic_sysstat_stats.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import io 6 | import os 7 | import subprocess 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from scipy import stats 12 | 13 | # When calculating aggregate stats, if some values are zero, we may 14 | # get a benign divide-by-zero warning from numpy; make it silent.
15 | np.seterr(divide='ignore') 16 | pd.options.mode.chained_assignment = None 17 | 18 | 19 | def sar(time): 20 | """ 21 | Measure sar into a buffer for parsing 22 | """ 23 | try: 24 | env = dict(os.environ, S_TIME_FORMAT="ISO", LC_TIME="ISO") 25 | res = subprocess.run(["sar", "-o", "out.dat", "-A", "1", f"{time}"], timeout=time+5, env=env, 26 | check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 27 | res = subprocess.run(["sar", "-f", "out.dat", "-A", "1"], env=env, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 28 | os.remove("out.dat") 29 | return io.StringIO(res.stdout.decode('utf-8')) 30 | except subprocess.CalledProcessError: 31 | print("Failed to measure statistics with sar.") 32 | print("Please check that sar is installed using install_perfrunbook_dependencies.sh and is in your PATH") 33 | return None 34 | 35 | 36 | def mpstat(time): 37 | """ 38 | Measure mpstat into a buffer for parsing 39 | """ 40 | try: 41 | env = dict(os.environ, S_TIME_FORMAT="ISO", LC_TIME="ISO") 42 | res = subprocess.run(["mpstat", "-I", "ALL", "-o", "JSON", "1", f"{time}"], timeout=time+5, env=env, 43 | check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 44 | return io.StringIO(res.stdout.decode('utf-8')) 45 | except subprocess.CalledProcessError: 46 | print("Failed to measure statistics with mpstat") 47 | print("Please check that mpstat is installed using install_perfrunbook_dependencies.sh and is in your PATH") 48 | 49 | 50 | def plot_terminal(data, title, xlabel, yrange): 51 | """ 52 | Plot data to the terminal using plotext 53 | """ 54 | import plotext as plt 55 | x = data.index.tolist() 56 | y = data[title].tolist() 57 | 58 | plt.scatter(x, y) 59 | plt.title(title) 60 | plt.xlabel(xlabel) 61 | plt.ylim(*yrange) 62 | plt.plot_size(100, 30) 63 | plt.show() 64 | 65 | 66 | def calc_stats_and_plot(df, stat, yaxis_range=None): 67 | """ 68 | Function that calculates the common stats and 69 | plots the data.
70 | """ 71 | df['time_delta'] = (df.index - df.index[0]).seconds 72 | df = df.set_index('time_delta') 73 | 74 | if yaxis_range: 75 | limit = yaxis_range 76 | else: 77 | limit = (0, df[stat].max() + 1) 78 | 79 | # Calculate some meaningful aggregate stats for comparing time-series plots 80 | geomean = stats.gmean(df[stat]) 81 | p50 = stats.scoreatpercentile(df[stat], 50) 82 | p90 = stats.scoreatpercentile(df[stat], 90) 83 | p99 = stats.scoreatpercentile(df[stat], 99) 84 | xtitle = f"gmean:{geomean:>6.2f} p50:{p50:>6.2f} p90:{p90:>6.2f} p99:{p99:>6.2f}" 85 | 86 | plot_terminal(df, stat, xtitle, limit) 87 | 88 | 89 | def parse_sar(sar_parse_class, buf): 90 | """ 91 | Parse SAR output to a pandas dataframe 92 | """ 93 | from sar_parse import parse_start_date 94 | line = buf.readline() 95 | start_date = parse_start_date(line) 96 | if not start_date: 97 | print("ERR: header not first line of Sar file, exiting") 98 | exit(1) 99 | 100 | parse = sar_parse_class(start_date) 101 | line = buf.readline() 102 | df = None 103 | while(line): 104 | df = parse.parse_for_header(line, buf, save_parquet=False) 105 | if (df is not None): 106 | break 107 | line = buf.readline() 108 | 109 | return df 110 | 111 | 112 | def plot_cpu(buf, stat): 113 | """ 114 | Plot cpu usage data from sar 115 | """ 116 | from sar_parse import ParseCpuTime 117 | df = parse_sar(ParseCpuTime, buf) 118 | 119 | YAXIS_RANGE = (0, 100) 120 | 121 | group = df.groupby('cpu') 122 | data = group.get_group('all') 123 | 124 | calc_stats_and_plot(data, stat, yaxis_range=YAXIS_RANGE) 125 | 126 | 127 | def plot_tcp(buf, stat): 128 | """ 129 | Plot the number of new connections being received over time 130 | """ 131 | from sar_parse import ParseTcpTime 132 | df = parse_sar(ParseTcpTime, buf) 133 | 134 | calc_stats_and_plot(df, stat) 135 | 136 | 137 | def plot_cswitch(buf, stat): 138 | """ 139 | Plot context switch data from sar 140 | """ 141 | from sar_parse import ParseCSwitchTime 142 | df = parse_sar(ParseCSwitchTime, buf) 143
| 144 | calc_stats_and_plot(df, stat) 145 | 146 | 147 | def plot_irq(buf, stat): 148 | """ 149 | Plot irq per second data from mpstat 150 | """ 151 | from mpstat_parse import parse_mpstat_json_all_irqs 152 | import json 153 | irqs = json.load(buf) 154 | 155 | df = parse_mpstat_json_all_irqs(irqs) 156 | 157 | calc_stats_and_plot(df, stat) 158 | 159 | 160 | def plot_specific_irq(buf, stat): 161 | """ 162 | Plot a specific IRQ source 163 | """ 164 | import json 165 | irqs = json.load(buf) 166 | 167 | # IPI0 - rescheduling interrupt 168 | # IPI1 - Function call interrupt 169 | # RES - rescheduling interrupt x86 170 | # CAL - function call interrupt x86 171 | from mpstat_parse import parse_mpstat_json_single_irq 172 | df = parse_mpstat_json_single_irq(irqs, stat) 173 | 174 | calc_stats_and_plot(df, stat) 175 | 176 | 177 | stat_mapping = { 178 | "cpu-user": (sar, plot_cpu, "usr"), 179 | "cpu-kernel": (sar, plot_cpu, "sys"), 180 | "cpu-iowait": (sar, plot_cpu, "iowait"), 181 | "new-connections": (sar, plot_tcp, "passive"), 182 | "tcp-in-segments": (sar, plot_tcp, "iseg"), 183 | "tcp-out-segments": (sar, plot_tcp, "oseg"), 184 | "cswitch": (sar, plot_cswitch, "cswch_s"), 185 | "all-irqs": (mpstat, plot_irq, "irq_s"), 186 | "single-irq": (mpstat, plot_specific_irq, ""), 187 | } 188 | 189 | if __name__ == "__main__": 190 | parser = argparse.ArgumentParser() 191 | parser.add_argument("--stat", default="cpu-user", type=str, choices=["cpu-user", "cpu-kernel", "cpu-iowait", 192 | "new-connections", "tcp-in-segments", "tcp-out-segments", 193 | "cswitch","all-irqs","single-irq"]) 194 | parser.add_argument("--irq", type=str, help="Specific IRQ to measure if single-irq chosen for stat") 195 | parser.add_argument("--time", default=60, type=int, help="How long to measure for in seconds") 196 | 197 | args = parser.parse_args() 198 | 199 | gather, plot, stat = stat_mapping[args.stat] 200 | 201 | if args.stat == "single-irq" and args.irq: 202 | stat = args.irq 203 | elif args.stat == 
"single-irq" and not args.irq: 204 | print("single-irq selected, need to specify --irq option") 205 | exit(1) 206 | 207 | text = gather(args.time) 208 | plot(text, stat) 209 | -------------------------------------------------------------------------------- /perfrunbook/utilities/mpstat_parse.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | 7 | # Pass in a dict s: 8 | # { date: string in YYYY-MM-DD 9 | # time: string in hh:mm:ss 10 | # } 11 | # Pass in a last_date as an np.datetime64 obj or None 12 | # Returns a np.datetime64 object 13 | def parse_time(time, last_date): 14 | # Get ourselves a date in ISO format from last_date 15 | date = f"{last_date.tolist().year}-{last_date.tolist().month:02d}-{last_date.tolist().day:02d}" 16 | d = np.datetime64(f"{date} {time}") 17 | if last_date: 18 | while (d - last_date) < np.timedelta64(0, 's'): 19 | d = d + np.timedelta64(1, 'D') 20 | 21 | return d 22 | 23 | 24 | def parse_mpstat_json_all_irqs(data): 25 | """ 26 | Parses IRQs for entire system 27 | """ 28 | irq_data = data["sysstat"]["hosts"][0]["statistics"] 29 | date = data["sysstat"]["hosts"][0]["date"] 30 | 31 | data = {"time": [], 32 | "irq_s": []} 33 | 34 | last_date = None 35 | for stats in irq_data: 36 | timestamp = stats["timestamp"] 37 | all_irqs = float(stats["sum-interrupts"][0]["intr"]) 38 | 39 | if last_date: 40 | date = parse_time(timestamp, last_date) 41 | else: 42 | date = np.datetime64(f"{date} {timestamp}") 43 | last_date = date 44 | 45 | data["time"].append(date) 46 | data["irq_s"].append(all_irqs) 47 | 48 | df = pd.DataFrame(data) 49 | df = df.set_index('time') 50 | return df 51 | 52 | 53 | def parse_mpstat_json_single_irq(data, irq): 54 | """ 55 | Does the generic parsing and combining 56 | """ 57 | irq_data = data["sysstat"]["hosts"][0]["statistics"] 58 | date = data["sysstat"]["hosts"][0]["date"] 59 | 60 | data = {"time": []} 61 | 
data[irq] = [] 62 | 63 | last_date = None 64 | for stats in irq_data: 65 | timestamp = stats["timestamp"] 66 | 67 | single_irq = 0 68 | for cpus in stats["individual-interrupts"]: 69 | for irqs in cpus["intr"]: 70 | if irqs["name"] == irq: 71 | single_irq += int(irqs["value"]) 72 | if last_date: 73 | date = parse_time(timestamp, last_date) 74 | else: 75 | date = np.datetime64(f"{date} {timestamp}") 76 | last_date = date 77 | data["time"].append(date) 78 | data[irq].append(single_irq) 79 | df = pd.DataFrame(data) 80 | df = df.set_index('time') 81 | return df 82 | -------------------------------------------------------------------------------- /php-opcache-al2.md: -------------------------------------------------------------------------------- 1 | # PHP OPcache Installation on Amazon Linux 2 (AL2) 2 | 3 | ### Install PHP 4 | 5 | First run `sudo amazon-linux-extras install -y php8.0` to install PHP 8 from AL2 extras, if not already installed. 6 | 7 | ### Sanity Check 8 | 9 | Verify that OPcache is not already present after installation; stop here if so. 10 | 11 | Run the following commands to see if "opcache.so" is present and enabled in php.ini. 12 | `php --version` prints a "with Zend OPcache" line on successful load. 13 | 14 | ``` 15 | $ file /usr/lib64/php/modules/opcache.so 16 | /usr/lib64/php/modules/opcache.so: ELF 64-bit LSB shared object <-- already installed 17 | 18 | $ php --version 19 | PHP 8.0.30 (cli) (built: Aug 24 2023 20:32:36) ( NTS ) 20 | Copyright (c) The PHP Group 21 | Zend Engine v4.0.30, Copyright (c) Zend Technologies 22 | with Zend OPcache v8.0.30, Copyright (c), by Zend Technologies <-- already enabled 23 | ``` 24 | 25 | ### Install Dependencies 26 | 27 | Install PHP dependencies required to build OPcache. This is ideally done by running `sudo yum-builddep php`, 28 | which fails in some configurations due to packaging conflict requiring both `libzip010-compat-devel` and `libzip-devel`. 
29 | Run the following as a workaround: 30 | 31 | ``` 32 | sudo yum install apr apr-devel apr-util apr-util-bdb apr-util-devel aspell aspell-devel autoconf automake bzip2-devel cpp cyrus-sasl cyrus-sasl-devel elfutils-devel elfutils-libelf-devel enchant enchant-devel expat-devel freetype-devel gcc gcc-c++ gdbm-devel generic-logos-httpd glib2-devel gmp-devel httpd httpd-devel httpd-filesystem httpd-tools libacl-devel libatomic libattr-devel libcurl-devel libdb-devel libedit-devel libgcrypt-devel libgpg-error-devel libICE libicu-devel libitm libjpeg-turbo-devel libmpc libpng-devel libsanitizer libSM libsodium libsodium-devel libtool libtool-ltdl libtool-ltdl-devel libwebp-devel libX11 libX11-common libX11-devel libXau libXau-devel libxcb libxcb-devel libXext libxml2-devel libXpm libXpm-devel libxslt libxslt-devel libXt libzip-devel lm_sensors-devel m4 mailcap mod_http2 mpfr ncurses-c++-libs ncurses-devel net-snmp net-snmp-agent-libs net-snmp-devel net-snmp-libs oniguruma oniguruma-devel openldap-devel pam-devel perl-devel perl-ExtUtils-Install perl-ExtUtils-MakeMaker perl-ExtUtils-Manifest perl-ExtUtils-ParseXS perl-Test-Harness popt-devel postgresql postgresql-devel pyparsing recode recode-devel rpm-devel sqlite-devel systemd-devel systemtap-sdt-devel t1lib t1lib-devel tcp_wrappers-devel tokyocabinet tokyocabinet-devel unixODBC unixODBC-devel xorg-x11-proto-devel xz-devel 33 | ``` 34 | 35 | ### Build Source RPM 36 | 37 | ``` 38 | cd ~ 39 | yumdownloader --source php 40 | rpm -ivh ./php-8.0.30-1.amzn2.src.rpm 41 | sudo yum-builddep php 42 | cd ./rpmbuild/SPECS 43 | rpmbuild -ba php.spec 44 | ``` 45 | 46 | ### Install OPcache 47 | 48 | ``` 49 | cd ~/rpmbuild/BUILD 50 | sudo cp ./php-8.0.30/build-cgi/modules/opcache.so /usr/lib64/php/modules/opcache.so 51 | sudo cp ./php-8.0.30/10-opcache.ini /etc/php.d/10-opcache.ini 52 | ``` 53 | 54 | Verify installation by running `php --version`. Output should now look similar to the examples above.
55 | Reboot your instance or restart php-fpm and your http server to use OPcache. 56 | -------------------------------------------------------------------------------- /php.md: -------------------------------------------------------------------------------- 1 | # PHP on Graviton 2 | 3 | PHP is a general-purpose scripting language geared towards web development. 4 | PHP scripts are executed by an interpreter implemented as a plug-in module 5 | in web servers, a separate daemon (php-fpm) or a CGI executable (php-cgi). 6 | 7 | PHP 7.4 and later are tested to perform well on Graviton. They work out of 8 | the box on Ubuntu 22.04 and AL2023, but require extra steps on AL2. 9 | 10 | ### OPcache on Amazon Linux 2 (AL2) 11 | 12 | OPcache improves PHP performance by storing precompiled script bytecode in shared memory, thereby removing 13 | the need for PHP to load and parse scripts on each request. Installing it can significantly improve 14 | execution time on most workloads. More information about OPcache is available in the 15 | [PHP Manual](https://www.php.net/manual/en/book.opcache.php). 16 | 17 | OPcache is installed by default on Amazon Linux 2023 (AL2023) and later, but not yet available in Amazon Linux 2 (AL2). 18 | See [PHP OPcache Installation on AL2](php-opcache-al2.md) for manual build and install instructions. 19 | -------------------------------------------------------------------------------- /rust.md: -------------------------------------------------------------------------------- 1 | # Rust on Graviton 2 | 3 | Rust is supported on Linux/arm64 systems as a tier 1 platform alongside x86. 4 | 5 | ### Large-System Extensions (LSE) 6 | 7 | All available Graviton processors (excluding the first-generation Graviton1, which is not recommended) 8 | support the Armv8.2 instruction set. The Armv8.2 specification includes the 9 | large-system extensions (LSE) introduced in Armv8.1.
LSE provides low-cost atomic 10 | operations. 11 | 12 | LSE improves system throughput for CPU-to-CPU communication, locks, and mutexes. 13 | The improvement can be up to an order of magnitude when using LSE instead of 14 | load/store exclusives. LSE can be enabled in Rust and we've seen cases on 15 | larger machines where performance is improved by over 3x by setting the `RUSTFLAGS` 16 | environment variable and rebuilding your project. 17 | 18 | ``` 19 | export RUSTFLAGS="-Ctarget-feature=+lse" 20 | cargo build --release 21 | ``` 22 | 23 | If you're running only on Graviton2 or newer hardware, you can also enable other 24 | instructions by setting the cpu target, as in the example below: 25 | 26 | ``` 27 | export RUSTFLAGS="-Ctarget-cpu=neoverse-n1" 28 | cargo build --release 29 | ``` 30 | 31 | When Rust is configured to use LLVM 12 or newer, the target feature 32 | `+outline-atomics` is available. Outline-atomics produces a binary containing 33 | two versions of each atomic operation and selects between them based on hardware capabilities. When 34 | the code executes on newer hardware such as Graviton2, the processor will 35 | execute LSE instructions; when the code executes on older hardware without LSE 36 | instructions, the processor will execute Armv8.0 atomics instructions. 37 | 38 | Rust 1.57 (released on December 2, 2021) enables the outline-atomics 39 | target feature by default when compiling for arm64-linux with LLVM 12 or newer.
When using 40 | older Rust releases, the outline-atomics target feature can be enabled with 41 | ``` 42 | export RUSTFLAGS="-Ctarget-feature=+outline-atomics" 43 | ``` 44 | -------------------------------------------------------------------------------- /sample-code/crc.c: -------------------------------------------------------------------------------- 1 | #include <stdint.h> 2 | #include "arm_acle.h" 3 | 4 | uint32_t arm_crc32c(const uint8_t *data, int length, uint32_t prev_crc) { 5 | uint32_t crc = ~prev_crc; 6 | 7 | // Align data if it's not aligned 8 | while (((uintptr_t)data & 7) && length > 0) { 9 | crc = __crc32cb(crc, *(uint8_t *)data); 10 | data++; 11 | length--; 12 | } 13 | 14 | while (length >= 8) { 15 | crc = __crc32cd(crc, *(uint64_t *)data); 16 | data += 8; 17 | length -= 8; 18 | } 19 | 20 | while (length > 0) { 21 | crc = __crc32cb(crc, *(uint8_t *)data); 22 | data++; 23 | length--; 24 | } 25 | 26 | return ~crc; 27 | } 28 | 29 | uint32_t arm_crc32(const uint8_t *data, int length, uint32_t prev_crc) { 30 | uint32_t crc = ~prev_crc; 31 | 32 | // Align data if it's not aligned 33 | while (((uintptr_t)data & 7) && length > 0) { 34 | crc = __crc32b(crc, *(uint8_t *)data); 35 | data++; 36 | length--; 37 | } 38 | 39 | while (length >= 8) { 40 | crc = __crc32d(crc, *(uint64_t *)data); 41 | data += 8; 42 | length -= 8; 43 | } 44 | 45 | while (length > 0) { 46 | crc = __crc32b(crc, *(uint8_t *)data); 47 | data++; 48 | length--; 49 | } 50 | 51 | return ~crc; 52 | } 53 | -------------------------------------------------------------------------------- /sample-code/hwcaps-test.c: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | #include <sys/auxv.h> 4 | #include <asm/hwcap.h> 5 | #include <arm_sve.h> 6 | 7 | #define sizeof_array(a) (sizeof(a) / sizeof((a)[0])) 8 | 9 | uint64_t sum_all(uint32_t *values, int length) 10 | { 11 | uint64_t sum = 0; 12 | for (int i = 0; i < length; i++) 13 | sum += values[i]; 14 | return sum; 15 | } 16 | 17 | #pragma GCC target("+sve2")
18 | #pragma clang attribute push(__attribute__((target("sve2"))), apply_to = function) 19 | uint64_t sum_all_sve2(uint32_t *values, int length) 20 | { 21 | svuint64_t sum = svdup_u64(0); 22 | int i = 0; 23 | svbool_t predicate = svwhilelt_b32(i, length); 24 | do { 25 | svuint32_t a = svld1(predicate, (uint32_t *) &values[i]); 26 | sum = svadalp_u64_x(predicate, sum, a); 27 | i += svcntw(); 28 | predicate = svwhilelt_b32(i, length); 29 | } while (svptest_any(svptrue_b32(), predicate)); 30 | return svaddv_u64(svptrue_b64(), sum); 31 | } 32 | #pragma clang attribute pop 33 | 34 | void test() { 35 | uint32_t values[13] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}; 36 | 37 | int have_sve = !!(getauxval(AT_HWCAP2) & HWCAP2_SVE2); 38 | uint64_t sum = 0; 39 | if (have_sve) { 40 | sum = sum_all_sve2(&values[0], sizeof_array(values)); 41 | } else { 42 | sum = sum_all(&values[0], sizeof_array(values)); 43 | } 44 | 45 | printf("sum: %lu, computed %s SVE2\n", sum, (have_sve) ? "with" : "without"); 46 | } 47 | 48 | int main(int argc, char *argv[]) 49 | { 50 | test(); 51 | return 0; 52 | } 53 | -------------------------------------------------------------------------------- /sample-code/hwcaps.c: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | #include <sys/auxv.h> 4 | #include <asm/hwcap.h> 5 | 6 | #define HWCAP1_LIST(XX) \ 7 | XX(HWCAP_FP) \ 8 | XX(HWCAP_ASIMD) \ 9 | XX(HWCAP_EVTSTRM) \ 10 | XX(HWCAP_AES) \ 11 | XX(HWCAP_PMULL) \ 12 | XX(HWCAP_SHA1) \ 13 | XX(HWCAP_SHA2) \ 14 | XX(HWCAP_CRC32) \ 15 | XX(HWCAP_ATOMICS) \ 16 | XX(HWCAP_FPHP) \ 17 | XX(HWCAP_ASIMDHP) \ 18 | XX(HWCAP_CPUID) \ 19 | XX(HWCAP_ASIMDRDM) \ 20 | XX(HWCAP_JSCVT) \ 21 | XX(HWCAP_FCMA) \ 22 | XX(HWCAP_LRCPC) \ 23 | XX(HWCAP_DCPOP) \ 24 | XX(HWCAP_SHA3) \ 25 | XX(HWCAP_SM3) \ 26 | XX(HWCAP_SM4) \ 27 | XX(HWCAP_ASIMDDP) \ 28 | XX(HWCAP_SHA512) \ 29 | XX(HWCAP_SVE) \ 30 | XX(HWCAP_ASIMDFHM) \ 31 | XX(HWCAP_DIT) \ 32 | XX(HWCAP_USCAT) \ 33 | XX(HWCAP_ILRCPC) \ 34 | XX(HWCAP_FLAGM) \
35 | XX(HWCAP_SSBS) \ 36 | XX(HWCAP_SB) \ 37 | XX(HWCAP_PACA) \ 38 | XX(HWCAP_PACG) 39 | 40 | #if !defined(HWCAP2_MTE3) 41 | #define HWCAP2_MTE3 (1UL << 63) 42 | #endif 43 | #if !defined(HWCAP2_SME) 44 | #define HWCAP2_SME (1UL << 63) 45 | #endif 46 | #if !defined(HWCAP2_SME_I16I64) 47 | #define HWCAP2_SME_I16I64 (1UL << 63) 48 | #endif 49 | #if !defined(HWCAP2_SME_F64F64) 50 | #define HWCAP2_SME_F64F64 (1UL << 63) 51 | #endif 52 | #if !defined(HWCAP2_SME_I8I32) 53 | #define HWCAP2_SME_I8I32 (1UL << 63) 54 | #endif 55 | #if !defined(HWCAP2_SME_F16F32) 56 | #define HWCAP2_SME_F16F32 (1UL << 63) 57 | #endif 58 | #if !defined(HWCAP2_SME_B16F32) 59 | #define HWCAP2_SME_B16F32 (1UL << 63) 60 | #endif 61 | #if !defined(HWCAP2_SME_F32F32) 62 | #define HWCAP2_SME_F32F32 (1UL << 63) 63 | #endif 64 | #if !defined(HWCAP2_SME_FA64) 65 | #define HWCAP2_SME_FA64 (1UL << 63) 66 | #endif 67 | #if !defined(HWCAP2_WFXT) 68 | #define HWCAP2_WFXT (1UL << 63) 69 | #endif 70 | #if !defined(HWCAP2_EBF16) 71 | #define HWCAP2_EBF16 (1UL << 63) 72 | #endif 73 | #if !defined(HWCAP2_SVE_EBF16) 74 | #define HWCAP2_SVE_EBF16 (1UL << 63) 75 | #endif 76 | #if !defined(HWCAP2_CSSC) 77 | #define HWCAP2_CSSC (1UL << 63) 78 | #endif 79 | #if !defined(HWCAP2_RPRFM) 80 | #define HWCAP2_RPRFM (1UL << 63) 81 | #endif 82 | #if !defined(HWCAP2_SVE2P1) 83 | #define HWCAP2_SVE2P1 (1UL << 63) 84 | #endif 85 | #if !defined(HWCAP2_SME2) 86 | #define HWCAP2_SME2 (1UL << 63) 87 | #endif 88 | #if !defined(HWCAP2_SME2P1) 89 | #define HWCAP2_SME2P1 (1UL << 63) 90 | #endif 91 | #if !defined(HWCAP2_SME_I16I32) 92 | #define HWCAP2_SME_I16I32 (1UL << 63) 93 | #endif 94 | #if !defined(HWCAP2_SME_BI32I32) 95 | #define HWCAP2_SME_BI32I32 (1UL << 63) 96 | #endif 97 | #if !defined(HWCAP2_SME_B16B16) 98 | #define HWCAP2_SME_B16B16 (1UL << 63) 99 | #endif 100 | #if !defined(HWCAP2_SME_F16F16) 101 | #define HWCAP2_SME_F16F16 (1UL << 63) 102 | #endif 103 | #if !defined(HWCAP2_MOPS) 104 | #define HWCAP2_MOPS (1UL << 63) 105 | 
#endif 106 | #if !defined(HWCAP2_HBC) 107 | #define HWCAP2_HBC (1UL << 63) 108 | #endif 109 | #if !defined(HWCAP2_SVE_B16B16) 110 | #define HWCAP2_SVE_B16B16 (1UL << 63) 111 | #endif 112 | #if !defined(HWCAP2_LRCPC3) 113 | #define HWCAP2_LRCPC3 (1UL << 63) 114 | #endif 115 | #if !defined(HWCAP2_LSE128) 116 | #define HWCAP2_LSE128 (1UL << 63) 117 | #endif 118 | 119 | #define HWCAP2_LIST(XX) \ 120 | XX(HWCAP2_DCPODP) \ 121 | XX(HWCAP2_SVE2) \ 122 | XX(HWCAP2_SVEAES) \ 123 | XX(HWCAP2_SVEPMULL) \ 124 | XX(HWCAP2_SVEBITPERM) \ 125 | XX(HWCAP2_SVESHA3) \ 126 | XX(HWCAP2_SVESM4) \ 127 | XX(HWCAP2_FLAGM2) \ 128 | XX(HWCAP2_FRINT) \ 129 | XX(HWCAP2_SVEI8MM) \ 130 | XX(HWCAP2_SVEF32MM) \ 131 | XX(HWCAP2_SVEF64MM) \ 132 | XX(HWCAP2_SVEBF16) \ 133 | XX(HWCAP2_I8MM) \ 134 | XX(HWCAP2_BF16) \ 135 | XX(HWCAP2_DGH) \ 136 | XX(HWCAP2_RNG) \ 137 | XX(HWCAP2_BTI) \ 138 | XX(HWCAP2_MTE) \ 139 | XX(HWCAP2_ECV) \ 140 | XX(HWCAP2_AFP) \ 141 | XX(HWCAP2_RPRES) \ 142 | XX(HWCAP2_MTE3) \ 143 | XX(HWCAP2_SME) \ 144 | XX(HWCAP2_SME_I16I64) \ 145 | XX(HWCAP2_SME_F64F64) \ 146 | XX(HWCAP2_SME_I8I32) \ 147 | XX(HWCAP2_SME_F16F32) \ 148 | XX(HWCAP2_SME_B16F32) \ 149 | XX(HWCAP2_SME_F32F32) \ 150 | XX(HWCAP2_SME_FA64) \ 151 | XX(HWCAP2_WFXT) \ 152 | XX(HWCAP2_EBF16) \ 153 | XX(HWCAP2_SVE_EBF16) \ 154 | XX(HWCAP2_CSSC) \ 155 | XX(HWCAP2_RPRFM) \ 156 | XX(HWCAP2_SVE2P1) \ 157 | XX(HWCAP2_SME2) \ 158 | XX(HWCAP2_SME2P1) \ 159 | XX(HWCAP2_SME_I16I32) \ 160 | XX(HWCAP2_SME_BI32I32) \ 161 | XX(HWCAP2_SME_B16B16) \ 162 | XX(HWCAP2_SME_F16F16) \ 163 | XX(HWCAP2_MOPS) \ 164 | XX(HWCAP2_HBC) \ 165 | XX(HWCAP2_SVE_B16B16) \ 166 | XX(HWCAP2_LRCPC3) \ 167 | XX(HWCAP2_LSE128) 168 | 169 | void aarch64_get_cpu_flags() 170 | { 171 | unsigned long hwcap = getauxval(AT_HWCAP); 172 | unsigned long hwcap2 = getauxval(AT_HWCAP2); 173 | 174 | #define XX(cap) \ 175 | printf( "%-20s %s\n", #cap, (hwcap & cap) ? 
"*" : " " ); 176 | HWCAP1_LIST(XX) 177 | #undef XX 178 | 179 | #define XX(cap) \ 180 | printf( "%-20s %s\n", #cap, (hwcap2 & cap) ? "*" : " " ); 181 | HWCAP2_LIST(XX) 182 | #undef XX 183 | } 184 | 185 | int main(int argc, char *argv[]) 186 | { 187 | aarch64_get_cpu_flags(); 188 | return 0; 189 | } 190 | -------------------------------------------------------------------------------- /sample-code/lambda_region_finder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | # Graviton2 Function Finder 4 | # Identify Lambda functions with Graviton2 compatible and not-compatible runtimes versions. Looks in all regions where Graviton2 Lambda is currently available. 5 | # Lambda runtimes support for Graviton2 docs: https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html 6 | 7 | supported_regions=(us-east-1 us-east-2 us-west-2 eu-west-1 eu-west-2 eu-central-1 ap-southeast-1 ap-southeast-2 ap-northeast-1 ap-south-1) 8 | supported_runtimes=(python3.8 python3.9 nodejs12.x nodejs14.x dotnetcore3.1 ruby2.7 java8.al2 java11 provided.al2) 9 | unsupported_runtimes=(python3.6 python3.7 python2.7 nodejs10.x dotnetcore2.1 ruby2.5 java8 go1.x provided) 10 | 11 | echo "Graviton2 Function Support Finder" 12 | 13 | for region in "${supported_regions[@]}" 14 | do 15 | echo " " 16 | echo "Region: [${region}] - Functions WITH Graviton Compatible Runtimes" 17 | echo " " 18 | 19 | for runtime in "${supported_runtimes[@]}" 20 | do 21 | aws lambda list-functions --region "${region}" --output text --query "Functions[?Runtime=='${runtime}'].{ARN:FunctionArn, Runtime:Runtime}" 22 | 23 | done 24 | 25 | # include the container image functions 26 | aws lambda list-functions --region "${region}" --output text --query "Functions[?PackageType=='Image'].{ARN:FunctionArn, PackageType:'container-image'}" 27 | 28 | 29 | echo " " 30 | echo "Region: [${region}] - Functions with Runtimes that are NOT Compatible with Graviton2. 
Require a Runtime version update." 31 | echo " " 32 | 33 | for runtime in "${unsupported_runtimes[@]}" 34 | do 35 | aws lambda list-functions --region "${region}" --output text --query "Functions[?Runtime=='${runtime}'].{ARN:FunctionArn, Runtime:Runtime}" 36 | done 37 | done 38 | echo "finished" -------------------------------------------------------------------------------- /software/ChromeAndPuppeteer.md: -------------------------------------------------------------------------------- 1 | # Headless website testing with Chrome and Puppeteer on Graviton. 2 | 3 | Chrome is a popular reference browser for website unit testing on EC2 x86 instances. In the same manner, it can be used on EC2 Graviton instances. 4 | It is open source in its Chromium incarnation, supports 'headless' mode and can simulate user actions via Javascript. 5 | The Chrome web site [mentions Puppeteer](https://developer.chrome.com/blog/headless-chrome/#using-programmatically-node) and it is used here to show a specific 6 | example of headless website testing on Graviton. 7 | “[Puppeteer](https://pptr.dev/) is a Node.js library which provides a high-level API to control Chrome/Chromium over the DevTools Protocol. 8 | Puppeteer runs in headless mode by default, but can be configured to run in full ("headful") Chrome/Chromium.” 9 | It can serve as a replacement for the previously very popular PhantomJS. 10 | “PhantomJS (phantomjs.org) is a headless WebKit scriptable with JavaScript. The latest stable release is version 2.1. 11 | Important: PhantomJS development is suspended until further notice (see #15344 for more details).” 12 | The APIs are different, so code targeting PhantomJS has to be rewritten when moving to Puppeteer (see Appendix). 13 | Puppeteer is open source and has 466 contributors and 364k users and thus is likely to be supported for some time. 14 | 15 | ### Get a recent version of NodeJS.
16 | 17 | Ubuntu-22: 18 | ``` 19 | sudo apt-get update 20 | sudo apt-get install -y ca-certificates curl gnupg 21 | sudo mkdir -p /etc/apt/keyrings 22 | curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | sudo gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg 23 | NODE_MAJOR=20 24 | echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_$NODE_MAJOR.x nodistro main" | sudo tee /etc/apt/sources.list.d/nodesource.list 25 | sudo apt-get update 26 | sudo apt-get install nodejs -y 27 | ``` 28 | AL2023: 29 | ``` 30 | sudo yum install https://rpm.nodesource.com/pub_20.x/nodistro/repo/nodesource-release-nodistro-1.noarch.rpm -y 31 | sudo yum install nodejs -y --setopt=nodesource-nodejs.module_hotfixes=1 32 | ``` 33 | [https://github.com/nodesource/distributions/tree/master] 34 | 35 | ### Install Puppeteer. 36 | ``` 37 | npm i puppeteer@21.1.0 38 | ``` 39 | Puppeteer packages an x86 version of Chrome which needs to be replaced with an aarch64 version. 40 | 41 | ### Install aarch64 version of Chrome. 
42 | 43 | Ubuntu-22: 44 | ``` 45 | sudo apt update 46 | sudo apt install chromium-browser chromium-codecs-ffmpeg 47 | ``` 48 | AL2023: 49 | ``` 50 | QTVER=5.15.9-2 51 | CHROMEVER=116.0.5845.96 52 | 53 | sudo dnf -y install \ 54 | https://kojipkgs.fedoraproject.org//packages/minizip/2.8.9/2.el8/aarch64/minizip-2.8.9-2.el8.aarch64.rpm \ 55 | https://download-ib01.fedoraproject.org/pub/epel/9/Everything/aarch64/Packages/n/nss-mdns-0.15.1-3.1.el9.aarch64.rpm \ 56 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/gstreamer1-1.18.4-4.el9.aarch64.rpm \ 57 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/libcanberra-0.30-26.el9.aarch64.rpm \ 58 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/libcanberra-gtk3-0.30-26.el9.aarch64.rpm \ 59 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/sound-theme-freedesktop-0.8-17.el9.noarch.rpm \ 60 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/qt5-qtbase-$QTVER.el9.aarch64.rpm \ 61 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/qt5-qtbase-common-$QTVER.el9.noarch.rpm \ 62 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/qt5-qtbase-gui-$QTVER.el9.aarch64.rpm \ 63 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/glx-utils-8.4.0-12.20210504git0f9e7d9.el9.aarch64.rpm \ 64 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/pipewire-libs-0.3.47-2.el9.aarch64.rpm \ 65 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/fdk-aac-free-2.0.0-8.el9.aarch64.rpm \ 66 | http://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/libldac-2.0.2.3-10.el9.aarch64.rpm \ 67 | https://kojipkgs.fedoraproject.org//packages/chromium/$CHROMEVER/1.el8/aarch64/chromium-$CHROMEVER-1.el8.aarch64.rpm \ 68 | 
https://kojipkgs.fedoraproject.org//packages/chromium/$CHROMEVER/1.el8/aarch64/chromium-common-$CHROMEVER-1.el8.aarch64.rpm \ 69 | https://kojipkgs.fedoraproject.org//packages/chromium/$CHROMEVER/1.el8/aarch64/chromium-headless-$CHROMEVER-1.el8.aarch64.rpm \ 70 | https://kojipkgs.fedoraproject.org//packages/chromium/$CHROMEVER/1.el8/aarch64/chromedriver-$CHROMEVER-1.el8.aarch64.rpm 71 | ``` 72 | As the Qt version above is updated and the old one becomes unavailable at some point, the version variable will need to be changed accordingly. 73 | If this is the case, check https://mirror.stream.centos.org/9-stream/AppStream/aarch64/os/Packages/ to see which version is available. 74 | ``` 75 | rm .cache/puppeteer/chrome/linux-116.0.5845.96/chrome-linux64/chrome 76 | ln -s /usr/bin/chromium-browser .cache/puppeteer/chrome/linux-116.0.5845.96/chrome-linux64/chrome 77 | ``` 78 | This brute-force install works because: 79 | ``` 80 | /snap/bin/chromium --version 81 | Chromium 116.0.5845.96 snap 82 | ``` 83 | Puppeteer needs a particular version of Chromium; in this case (Puppeteer-21.1.0), it uses Chromium 116.0.5845.96. 84 | If a different version of Puppeteer is needed, the directory `~/.cache/puppeteer/chrome` indicates which version of Chromium it is targeting. 85 | This version must be assigned to the `CHROMEVER` variable above. 86 | If the required version of Chromium is not available at `https://kojipkgs.fedoraproject.org/packages/chromium/` 87 | then proceed to `https://chromium.googlesource.com/chromium/src/+/main/docs/linux/build_instructions.md`. 88 | 89 | ### Code Examples 90 | 91 | Example code for Puppeteer can be found at https://github.com/puppeteer/examples 92 | 93 | ### Virtual Framebuffer Xserver 94 | 95 | Some code examples, such as puppeteer/examples/oopif.js, may need a 'headful' chrome and thus an Xserver. 96 | A virtual framebuffer Xserver can be used for that.

Ubuntu-22.04: ```sudo apt install xvfb```
AL2023: ```sudo yum install Xvfb```
```
Xvfb &
export DISPLAY=:0
```
When Chrome is now invoked in headful mode, it has an Xserver to render to.

This can be tested with:
```
node puppeteer/examples/oopif.js
```
The oopif.js example invokes Chrome in headful mode.

## Appendix

### Code example to show the difference in API between PhantomJS and Puppeteer

Puppeteer screenshot:
```
'use strict';

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.google.com', {waitUntil: 'load', timeout: 1000});
  await page.screenshot({path: 'google.png'});
  await browser.close();
})();
```
The same with PhantomJS:
```
var page = require('webpage').create();
page.open('http://www.google.com', function() {
  setTimeout(function() {
    page.render('google.png');
    phantom.exit();
  }, 200);
});
```

--------------------------------------------------------------------------------
/software/librdkafka.md:
--------------------------------------------------------------------------------

Building librdkafka for AWS Graviton (including Python module)
==============================================================

[librdkafka](https://github.com/confluentinc/librdkafka) is a C library
implementation of the Apache Kafka protocol, providing Producer, Consumer and
Admin clients. It was designed with message delivery reliability and high
performance in mind.

## Table of contents

- [Amazon Linux 2](#amazon-linux-2)
- [Red Hat Enterprise Linux 8 (and compatible EL 8 distributions e.g. Rocky Linux)](#red-hat-enterprise-linux-8-and-compatible-el-8-distributions-eg-rocky-linux)
- [Ubuntu 20.04 (Focal) and 22.04 (Jammy)](#ubuntu-2004-focal-and-2204-jammy)

## Amazon Linux 2

First, install the necessary dependencies, add the current user to the `mock` group, and log out.

```sh
sudo amazon-linux-extras install -y mock2
sudo yum -y install git
sudo usermod -G mock `id -un`
logout
```

Next, log back into the instance, and run:

```sh
export LIBRDKAFKA_VERSION=2.3.0 # The minimum version required is 2.3.0
git clone -b v${LIBRDKAFKA_VERSION} https://github.com/confluentinc/librdkafka
cd librdkafka/packaging/rpm
MOCK_CONFIG=/etc/mock/amazonlinux-2-aarch64.cfg make
# Packages will be placed in ./pkgs-${LIBRDKAFKA_VERSION}-1-/etc/mock/amazonlinux-2-aarch64.cfg/
cd ./pkgs-${LIBRDKAFKA_VERSION}-1-/etc/mock/amazonlinux-2-aarch64.cfg/
sudo yum -y install *.aarch64.rpm
```

Once you have installed the RPM packages, you can build and install the Python module:

```sh
sudo yum -y install gcc python3-devel
python3 -m pip install --user --no-binary confluent-kafka confluent-kafka
```

## Red Hat Enterprise Linux 8 (and compatible EL 8 distributions e.g. Rocky Linux)

First, install the necessary dependencies, add the current user to the `mock` group, and log out.

```sh
sudo dnf config-manager --set-enabled powertools
sudo dnf install -y git make epel-release
sudo dnf install -y mock
sudo usermod -G mock `id -un`
logout
```

Next, log back into the instance, and run:

```sh
export LIBRDKAFKA_VERSION=2.0.2 # Or whichever version you need. We tested with 2.0.2.
git clone -b v${LIBRDKAFKA_VERSION} https://github.com/confluentinc/librdkafka
cd librdkafka/packaging/rpm
make
# Packages will be placed in ./pkgs-${LIBRDKAFKA_VERSION}-1-default/
cd ./pkgs-${LIBRDKAFKA_VERSION}-1-default
sudo dnf -y install *.aarch64.rpm
```

Once you have installed the RPM packages, you can build and install the Python module:

```sh
sudo dnf -y install gcc python3-devel
python3 -m pip install --user --no-binary confluent-kafka confluent-kafka
```

## Ubuntu 20.04 (Focal) and 22.04 (Jammy)

```sh
export LIBRDKAFKA_VERSION=2.0.2 # Or whichever version you need. We tested with 2.0.2.
export EMAIL=builder@example.com
sudo apt-get update
sudo apt-get install -y git-buildpackage debhelper zlib1g-dev libssl-dev libsasl2-dev liblz4-dev
git clone https://github.com/confluentinc/librdkafka
cd librdkafka
git checkout -b debian v${LIBRDKAFKA_VERSION}
dch --newversion ${LIBRDKAFKA_VERSION}-1 "Release version ${LIBRDKAFKA_VERSION}" --urgency low
dch --release --distribution unstable ""
git commit -am "Tag Debian release ${LIBRDKAFKA_VERSION}"
mkdir ../build-area
git archive --format=tgz --output=../build-area/librdkafka_${LIBRDKAFKA_VERSION}.orig.tar.gz HEAD
gbp buildpackage -us -uc --git-verbose --git-builder="debuild --set-envvar=VERSION=${LIBRDKAFKA_VERSION} --set-envvar=SKIP_TESTS=y -i -I" --git-ignore-new
```

This will yield a set of Debian packages in the build area.
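The artifact names follow standard Debian conventions, derived from the `<upstream-version>-1` revision set by `dch` above. A sketch of the naming to expect in the build area (the exact set of `.deb` packages depends on librdkafka's `debian/control`):

```shell
# Reconstruct the expected artifact names from the exported version.
LIBRDKAFKA_VERSION=2.0.2
echo "librdkafka_${LIBRDKAFKA_VERSION}.orig.tar.gz"      # upstream tarball created by git archive
echo "librdkafka_${LIBRDKAFKA_VERSION}-1_arm64.changes"  # build manifest written by debuild
```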
To install them:

```sh
sudo dpkg -i ../build-area/*_arm64.deb
```

Once you have installed the packages, you can build and install the Python module:

```sh
python3 -m pip install --user --no-binary confluent-kafka confluent-kafka
```

## Example Dockerfile for Python module

The following `Dockerfile` can be used to build a container image based on
Debian Bullseye containing the Python module. It produces a minimized image via
a multi-stage build.

```
FROM public.ecr.aws/docker/library/python:3.10.10-slim-bullseye AS build

ARG LIBRDKAFKA_VERSION=2.0.2
ENV EMAIL=nobody@build.example.com

WORKDIR /build
RUN apt-get update && \
    apt-get install -y git-buildpackage debhelper zlib1g-dev libssl-dev libsasl2-dev liblz4-dev python3-dev && \
    git clone https://github.com/confluentinc/librdkafka && \
    cd librdkafka && \
    git checkout -b debian v${LIBRDKAFKA_VERSION} && \
    dch --newversion ${LIBRDKAFKA_VERSION}-1 "Release version ${LIBRDKAFKA_VERSION}" --urgency low && \
    dch --release --distribution unstable "" && \
    git commit -am "Tag Debian release ${LIBRDKAFKA_VERSION}" && \
    mkdir ../build-area && \
    git archive --format=tgz --output=../build-area/librdkafka_${LIBRDKAFKA_VERSION}.orig.tar.gz HEAD && \
    gbp buildpackage -us -uc --git-verbose --git-builder="debuild --set-envvar=VERSION=${LIBRDKAFKA_VERSION} --set-envvar=SKIP_TESTS=y -i -I" --git-ignore-new && \
    apt-get -y install ../build-area/*.deb && \
    python3 -m pip install --no-binary confluent-kafka confluent-kafka


FROM public.ecr.aws/docker/library/python:3.10.10-slim-bullseye
ARG LIBRDKAFKA_VERSION=2.0.2
COPY --from=build /build/build-area/*.deb /tmp/
RUN apt-get update && apt-get -y install /tmp/*.deb && apt-get clean && rm -rf /var/cache/apt
COPY --from=build /usr/local/lib/python3.10/site-packages/confluent_kafka-${LIBRDKAFKA_VERSION}-py3.10.egg-info \
     /usr/local/lib/python3.10/site-packages/confluent_kafka-${LIBRDKAFKA_VERSION}-py3.10.egg-info
COPY --from=build /usr/local/lib/python3.10/site-packages/confluent_kafka/ \
     /usr/local/lib/python3.10/site-packages/confluent_kafka/
```
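Whichever route you used (RPM, Debian packages, or the container image), a quick smoke test confirms that the Python binding loads and reports the librdkafka version it links against. `libversion()` is part of the confluent_kafka API; the sketch below prints a fallback message rather than failing when the module is absent:

```shell
# Print the librdkafka version the confluent_kafka binding was built against.
# If the module is missing, the install steps above did not complete.
python3 - <<'EOF'
try:
    import confluent_kafka
    print("librdkafka:", confluent_kafka.libversion()[0])
except ImportError:
    print("confluent_kafka is not installed")
EOF
```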