├── texture_memory ├── README.md ├── forecast.txt ├── Project.toml └── test_texturemem.jl ├── .gitignore ├── GPU_ODE_Julia ├── .JuliaFormatter.toml ├── Project.toml ├── bin │ ├── ode_problems │ │ ├── linear.jl │ │ ├── multistate.jl │ │ ├── lorenz.jl │ │ ├── multisite2.jl │ │ ├── pleiades.jl │ │ └── pollu.jl │ └── catalyst_models │ │ ├── multistate.net │ │ └── multisite2.net ├── src │ └── GPU_ODE_Julia.jl ├── sde_examples │ ├── bench_cpu.jl │ ├── bench_gpu.jl │ └── bench_crn_model.jl ├── bench_cpu.jl ├── bench_lorenz_gpu.jl ├── bench_ensemblegpuarray.jl └── bench_multi_device.jl ├── paper_artifacts ├── figures │ ├── CPU_Lorenz.png │ ├── CPU_SDE_MTK.png │ ├── CPU_SDE_Linear.png │ ├── Lorenz_adaptive.png │ ├── Lorenz_unadaptive.png │ ├── CPU_Lorenz_adaptive.png │ ├── CPU_Lorenz_unadaptive.png │ └── Multi_GPU_unadaptive.png └── data │ ├── Julia │ ├── SDE │ │ ├── CRN │ │ │ └── Julia_times_unadaptive.txt │ │ └── Julia_times_unadaptive.txt │ ├── CPU │ │ ├── SDE │ │ │ ├── CRN │ │ │ │ └── Julia_times_unadaptive.txt │ │ │ └── times_unadaptive.txt │ │ ├── times_adaptive.txt │ │ └── times_unadaptive.txt │ ├── devices │ │ ├── oneAPI │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ ├── CUDA │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ ├── Metal │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ └── AMDGPU │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ └── EnsembleGPUArray │ │ ├── Julia_EnGPUArray_times_adaptive.txt │ │ └── Julia_EnGPUArray_times_unadaptive.txt │ ├── RTX_5000 │ ├── CPP │ │ ├── MPGOS_times_adaptive.txt │ │ └── MPGOS_times_unadaptive.txt │ ├── Julia │ │ ├── Julia_times_adaptive.txt │ │ └── Julia_times_unadaptive.txt │ ├── JAX │ │ ├── Jax_times_adaptive.txt │ │ └── Jax_times_unadaptive.txt │ └── PyTorch │ │ └── Torch_times_unadaptive.txt │ └── Tesla_V100 │ ├── CPP │ ├── MPGOS_times_adaptive.txt │ └── MPGOS_times_unadaptive.txt │ ├── Julia │ ├── 
Julia_times_unadaptive.txt │ └── Julia_times_adaptive.txt │ ├── JAX │ ├── Jax_times_unadaptive.txt │ └── Jax_times_adaptive.txt │ └── PyTorch │ └── Torch_times_unadaptive.txt ├── runner_scripts ├── gpu │ ├── run_ode_julia.sh │ ├── run_ode_pytorch.sh │ ├── run_ode_jax.sh │ ├── run_egarray_julia.sh │ ├── run_sde_julia.sh │ ├── run_ode_mult_device.sh │ ├── run_sde_crn.sh │ └── run_ode_cpp.sh ├── cpu │ ├── run_ode_julia.sh │ └── run_sde_julia.sh └── plot │ ├── plot_sde_comp.jl │ ├── plot_mult_gpu.jl │ ├── plot_sde_crn.jl │ ├── plot_ode_comp.jl │ └── plot_cpu_comp.jl ├── test_DiffEqGPU.jl ├── Project.toml ├── GPU_ODE_MPGOS ├── makefile ├── ProfileSpec.sh ├── SourceCodes │ ├── SingleSystem_PerThread_EventHandling.cuh │ ├── SingleSystem_PerThread_DenseOutput.cuh │ ├── MPGOS_Overloaded_MathFunction.cuh │ ├── SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh │ ├── SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh │ ├── CoupledSystems_PerBlock_EventHandling.cuh │ ├── SingleSystem_PerThread_Solver.cuh │ ├── CoupledSystems_PerBlock_DenseOutput.cuh │ └── TMP.cuh ├── Lorenz_SystemDefinition.cuh └── Lorenz.cu ├── MPI ├── mpi_test.sh └── gpu_ode_mpi.jl ├── GPU_ODE_JAX ├── requirements.txt ├── environment.yml └── bench_diffrax.py ├── LICENSE ├── GPU_ODE_PyTorch ├── requirements.txt ├── bench_torchdiffeq.py └── environment.yml ├── run_benchmark.sh └── README.md /texture_memory/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | plots 2 | *.exe 3 | /data -------------------------------------------------------------------------------- /GPU_ODE_Julia/.JuliaFormatter.toml: -------------------------------------------------------------------------------- 1 | style = "sciml" 
-------------------------------------------------------------------------------- /texture_memory/forecast.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/texture_memory/forecast.txt -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_SDE_MTK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_SDE_MTK.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_SDE_Linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_SDE_Linear.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Lorenz_adaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Lorenz_adaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Lorenz_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Lorenz_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz_adaptive.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz_adaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Multi_GPU_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Multi_GPU_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/SDE/CRN/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 6720 244.754027 2 | 26880 620.783003 3 | 107520 2042.824231 4 | 430080 7484.933462 5 | 1720320 29404.353759 6 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/SDE/CRN/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 6720 921.966429 2 | 26880 1944.847764 3 | 107520 6746.065749 4 | 430080 27894.174424 5 | 1720320 105755.144901 -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.032243 2 | 32 0.048715 3 | 128 0.107526 4 | 512 0.326734 5 | 2048 1.155272 6 | 8192 4.821152 7 | 32768 21.28074 8 | 131072 98.841839 9 | 524288 394.983798 10 | 
-------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.125069 2 | 32 0.219333 3 | 128 0.716072 4 | 512 2.721196 5 | 2048 11.163812 6 | 8192 43.882179 7 | 32768 175.232735 8 | 131072 707.112045 9 | 524288 3056.280229 10 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/CPP/MPGOS_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.144 2 | 32 0.147 3 | 128 0.152 4 | 512 0.153 5 | 2048 0.153 6 | 8192 0.245 7 | 32768 0.814 8 | 131072 3.113 9 | 524288 12.135 10 | 2097152 48.058 11 | 8388608 191.764 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/CPP/MPGOS_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.078 2 | 32 1.921 3 | 128 1.889 4 | 512 1.963 5 | 2048 1.921 6 | 8192 1.908 7 | 32768 1.942 8 | 131072 2.256 9 | 524288 3.575 10 | 2097152 11.046 11 | 8388608 48.808 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/CPP/MPGOS_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.118 2 | 32 2.092 3 | 128 2.04 4 | 512 1.992 5 | 2048 2.075 6 | 8192 2.023 7 | 32768 2.208 8 | 131072 3.025 9 | 524288 6.421 10 | 2097152 23.62 11 | 8388608 94.157 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/CPP/MPGOS_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.426 2 | 32 0.42 3 | 128 0.474 4 | 512 0.473 5 | 2048 0.479 6 | 8192 1.121 7 | 32768 4.12 8 | 131072 16.196 9 | 524288 63.315 10 | 2097152 248.314 11 | 8388608 886.082 
12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/Julia/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.231724 2 | 32 0.208332 3 | 128 0.232428 4 | 512 0.209701 5 | 2048 0.225962 6 | 8192 0.238633 7 | 32768 0.785327 8 | 131072 3.443626 9 | 524288 9.74068 10 | 2097152 36.68774 11 | 8388608 145.316839 -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | while [ $a -le $max_a ] 4 | do 5 | # Print the values 6 | echo $a 7 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_lorenz_gpu.jl $a 8 | # increment the value 9 | a=$((a*4)) 10 | done 11 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/SDE/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.381258 2 | 32 0.379741 3 | 128 0.419727 4 | 512 0.620064 5 | 2048 1.420111 6 | 8192 2.416906 7 | 32768 4.512974 8 | 131072 14.118681 9 | 524288 48.056674 10 | 2097152 181.737472 11 | 8388608 715.859498 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/oneAPI/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.804296 2 | 32 0.790966 3 | 128 0.815685 4 | 512 0.888314 5 | 2048 0.877024 6 | 8192 1.022843 7 | 32768 1.186602 8 | 131072 2.071555 9 | 524288 6.023225 10 | 2097152 22.745966 11 | 8388608 88.253276 -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/oneAPI/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.789936 2 | 32 
0.728326 3 | 128 0.814415 4 | 512 0.896874 5 | 2048 0.892524 6 | 8192 1.235761 7 | 32768 1.983526 8 | 131072 3.438985 9 | 524288 12.007101 10 | 2097152 46.710124 11 | 8388608 185.5231 -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/Julia/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.301317 2 | 32 0.301541 3 | 128 0.30454 4 | 512 0.330246 5 | 2048 0.349348 6 | 8192 0.368268 7 | 32768 0.61695 8 | 131072 1.582687 9 | 524288 4.655955 10 | 2097152 17.840218 11 | 8388608 69.451464 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/CUDA/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.252927 2 | 32 0.259247 3 | 128 0.261577 4 | 512 0.272637 5 | 2048 0.294848 6 | 8192 0.370816 7 | 32768 0.754333 8 | 131072 1.665724 9 | 524288 5.511288 10 | 2097152 20.722764 11 | 8388608 83.490168 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/Metal/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.967709 2 | 32 0.999 3 | 128 1.013375 4 | 512 1.187959 5 | 2048 1.373083 6 | 8192 1.471291 7 | 32768 2.381292 8 | 131072 6.039792 9 | 524288 20.657333 10 | 2097152 79.474417 11 | 8388608 317.480458 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/Julia/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.252927 2 | 32 0.259247 3 | 128 0.261577 4 | 512 0.272637 5 | 2048 0.294848 6 | 8192 0.370816 7 | 32768 0.754333 8 | 131072 1.665724 9 | 524288 5.511288 10 | 2097152 20.722764 11 | 8388608 83.490168 
-------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/SDE/times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.037384 2 | 32 0.154911 3 | 128 0.551648 4 | 512 2.20597 5 | 2048 4.844585 6 | 8192 5.201728 7 | 32768 10.771976 8 | 131072 43.050631 9 | 524288 170.593541 10 | 2097152 696.775282 11 | 8388608 6702.486593 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/AMDGPU/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.231099 2 | 32 2.19581 3 | 128 2.394492 4 | 512 2.858904 5 | 2048 4.108183 6 | 8192 4.520914 7 | 32768 16.186817 8 | 131072 91.010872 9 | 524288 380.084487 10 | 2097152 1536.688493 11 | 8388608 6163.51325 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/AMDGPU/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.843078 2 | 32 0.838157 3 | 128 0.832271 4 | 512 1.349344 5 | 2048 2.322879 6 | 8192 2.332571 7 | 32768 2.372267 8 | 131072 4.596871 9 | 524288 17.053023 10 | 2097152 66.638069 11 | 8388608 273.612436 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/CUDA/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.168368 2 | 32 0.171038 3 | 128 0.211548 4 | 512 0.209778 5 | 2048 0.213208 6 | 8192 0.331017 7 | 32768 0.904711 8 | 131072 3.280279 9 | 524288 11.920956 10 | 2097152 46.264071 11 | 8388608 183.919764 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/Metal/Julia_times_unadaptive.txt: 
-------------------------------------------------------------------------------- 1 | 8 0.716333 2 | 32 0.742291 3 | 128 0.721709 4 | 512 1.254791 5 | 2048 1.971708 6 | 8192 2.002875 7 | 32768 2.038125 8 | 131072 6.25925 9 | 524288 23.261792 10 | 2097152 90.567667 11 | 8388608 364.321583 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/Julia/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.168368 2 | 32 0.171038 3 | 128 0.211548 4 | 512 0.209778 5 | 2048 0.213208 6 | 8192 0.331017 7 | 32768 0.904711 8 | 131072 3.280279 9 | 524288 11.920956 10 | 2097152 46.264071 11 | 8388608 183.919764 -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_pytorch.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | while [ $a -le $max_a ] 4 | do 5 | # Print the values 6 | echo "No. 
of trajectories = $a" 7 | python3 ./GPU_ODE_PyTorch/bench_torchdiffeq.py $a 8 | # increment the value 9 | a=$((a*4)) 10 | done 11 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/EnsembleGPUArray/Julia_EnGPUArray_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 15.947266 2 | 32 15.95629 3 | 128 16.423258 4 | 512 18.110998 5 | 2048 18.736388 6 | 8192 19.82563 7 | 32768 24.065729 8 | 131072 49.500582 9 | 524288 192.086775 10 | 2097152 785.861873 11 | 8388608 3061.065027 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/EnsembleGPUArray/Julia_EnGPUArray_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 99.183004 2 | 32 100.362064 3 | 128 100.833487 4 | 512 122.938753 5 | 2048 122.718199 6 | 8192 130.923533 7 | 32768 134.314086 8 | 131072 304.430394 9 | 524288 1148.009043 10 | 2097152 4150.856981 11 | 8388608 15826.752605 12 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_jax.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | XLA_PYTHON_CLIENT_PREALLOCATE=false 4 | while [ $a -le $max_a ] 5 | do 6 | # Print the values 7 | echo "No. 
of trajectories = $a" 8 | python3 ./GPU_ODE_JAX/bench_diffrax.py $a 9 | # increment the value 10 | a=$((a*4)) 11 | done 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/JAX/Jax_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 11.396918998798355 2 | 32 11.75047601282131 3 | 128 12.41002899769228 4 | 512 12.64855700719636 5 | 2048 12.49069900950417 6 | 8192 14.121492000413127 7 | 32768 21.169946994632483 8 | 131072 206.5049299999373 9 | 524288 454.2432949965587 10 | 2097152 1770.2114550047554 11 | 8388608 7067.879137990531 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/JAX/Jax_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 46.13151300873142 2 | 32 44.98972499277443 3 | 128 44.53597999236081 4 | 512 44.87656700075604 5 | 2048 45.046036000712775 6 | 8192 44.75162898597773 7 | 32768 80.44722399790771 8 | 131072 552.1373499941546 9 | 524288 2157.217366999248 10 | 2097152 8489.217234004172 11 | 8388608 33885.78143600898 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/PyTorch/Torch_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 609.113594000064 2 | 32 627.9670649998934 3 | 128 637.831831999847 4 | 512 634.4590839999 5 | 2048 624.9206489999324 6 | 8192 623.1320730000789 7 | 32768 615.158344000065 8 | 131072 644.2248739999741 9 | 524288 2594.314949999898 10 | 2097152 10222.998637000046 11 | 8388608 40889.64921499996 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/JAX/Jax_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 60.93443697318435 2 | 32 61.113049974665046 3 
| 128 63.0807199049741 4 | 512 67.59428302757442 5 | 2048 67.74979480542243 6 | 8192 70.7671670243144 7 | 32768 82.74353202432394 8 | 131072 197.92419183067977 9 | 524288 730.48299504444 10 | 2097152 2684.088410111144 11 | 8388608 10038.143360987306 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/JAX/Jax_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 12.42110994644463 2 | 32 13.530798023566604 3 | 128 14.264097204431891 4 | 512 14.479244127869606 5 | 2048 15.887931920588017 6 | 8192 15.419425908476114 7 | 32768 19.164706114679575 8 | 131072 47.79849713668227 9 | 524288 190.3297039680183 10 | 2097152 680.2465228829533 11 | 8388608 2610.883393092081 12 | -------------------------------------------------------------------------------- /test_DiffEqGPU.jl: -------------------------------------------------------------------------------- 1 | using TestEnv 2 | using Pkg 3 | 4 | Pkg.add("DiffEqGPU") 5 | TestEnv.activate("DiffEqGPU") 6 | backend = ARGS[1] 7 | ENV["GROUP"] = backend 8 | Pkg.add(backend) 9 | 10 | ENV["JULIA_LOAD_PATH"]=dirname(Base.active_project()) 11 | 12 | using DiffEqGPU 13 | include(joinpath(dirname(pathof(DiffEqGPU)), "..", "test", "runtests.jl")) 14 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/PyTorch/Torch_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 775.3896450158209 2 | 32 1098.3309479197487 3 | 128 794.1124310018495 4 | 512 774.6156470384449 5 | 2048 781.6141300136223 6 | 8192 781.7080520326272 7 | 32768 785.07510793861 8 | 131072 790.6077449442819 9 | 524288 1086.9193120161071 10 | 2097152 3784.109622007236 11 | 8388608 14353.03207905963 12 | -------------------------------------------------------------------------------- /Project.toml: 
-------------------------------------------------------------------------------- 1 | [deps] 2 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" 3 | DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" 4 | LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" 5 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" 6 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 7 | StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" 8 | TestEnv = "1e6cf692-eddd-4d53-88a5-2d735e33781b" 9 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/makefile: -------------------------------------------------------------------------------- 1 | ROOT_DIR = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 2 | INCL_DIR = -I$(ROOT_DIR)/SourceCodes 3 | CMPL_OPT = -O3 -std=c++11 --ptxas-options=-v --gpu-architecture=sm_70 -lineinfo -maxrregcount=128 4 | SOURCE = Lorenz.cu 5 | 6 | all: Lorenz.exe 7 | 8 | Lorenz.exe: $(SOURCE) 9 | nvcc -o Lorenz.exe $(SOURCE) $(INCL_DIR) $(CMPL_OPT) 10 | 11 | clean: 12 | rm -f Lorenz.exe 13 | -------------------------------------------------------------------------------- /runner_scripts/cpu/run_ode_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="CPU" 5 | if [ -d "./data/${path}" ] 6 | then 7 | rm -rf "./data/${path}" 8 | mkdir -p "./data/${path}" 9 | else 10 | mkdir -p "./data/${path}" 11 | fi 12 | 13 | while [ $a -le $max_a ] 14 | do 15 | # Print the values 16 | echo $a 17 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_cpu.jl $a 18 | # increment the value 19 | a=$((a*4)) 20 | done 21 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_egarray_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | path="EnsembleGPUArray" 4 | if [ -d "./data/${path}" ] 5 | then 6 | rm -rf "./data/${path}" 7 | 
mkdir -p "./data/${path}" 8 | else 9 | mkdir -p "./data/${path}" 10 | fi 11 | while [ $a -le $max_a ] 12 | do 13 | # Print the values 14 | echo $a 15 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_ensemblegpuarray.jl $a 16 | # increment the value 17 | a=$((a*4)) 18 | done 19 | -------------------------------------------------------------------------------- /MPI/mpi_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Slurm Sbatch Options 3 | #SBATCH --gres=gpu:volta:1 4 | #SBATCH -n 5 -N 5 5 | #SBATCH --output="./mpi_scatter_test.log-%j" 6 | # Loading the required module 7 | 8 | export JULIA_CUDA_MEMORY_POOL=none 9 | export JULIA_MPI_BINARY=system 10 | export JULIA_CUDA_USE_BINARYBUILDER=false 11 | 12 | source $HOME/.bashrc 13 | module load cuda/11.6 mpi/openmpi-4.1.3 14 | 15 | srun hostname > hostfile 16 | #script 17 | time mpiexec julia gpu_ode_mpi.jl 18 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_sde_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="SDE" 5 | if [ -d "./data/${path}" ] 6 | then 7 | rm -rf "./data/${path}"/* || true 8 | mkdir -p "./data/${path}" 9 | else 10 | mkdir -p "./data/${path}" 11 | fi 12 | 13 | 14 | while [ $a -le $max_a ] 15 | do 16 | # Print the values 17 | echo $a 18 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_gpu.jl $a 19 | # increment the value 20 | a=$((a*4)) 21 | done 22 | -------------------------------------------------------------------------------- /runner_scripts/cpu/run_sde_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="CPU" 5 | if [ -d "./data/${path}/SDE" ] 6 | then 7 | rm -f "./data/${path}/SDE"/* || true 8 | mkdir -p "./data/${path}/SDE" 9 | else 10 | mkdir -p "./data/${path}/SDE" 11 | fi 12 | 13 | 
while [ $a -le $max_a ] 14 | do 15 | # Print the values 16 | echo $a 17 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_cpu.jl $a 18 | # increment the value 19 | a=$((a*4)) 20 | done 21 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_mult_device.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | backend=$2 4 | if [ -d "./data/devices/${backend}" ] 5 | then 6 | rm -rf "./data/devices/${backend}" 7 | mkdir -p "./data/devices/${backend}" 8 | else 9 | mkdir -p "./data/devices/${backend}" 10 | fi 11 | 12 | while [ $a -le $max_a ] 13 | do 14 | # Print the values 15 | echo $a 16 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_multi_device.jl $a $backend 17 | # increment the value 18 | a=$((a*4)) 19 | done 20 | -------------------------------------------------------------------------------- /texture_memory/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 3 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 4 | DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0" 5 | DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" 6 | Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" 7 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 8 | OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" 9 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" 10 | Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 11 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 12 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_sde_crn.sh: -------------------------------------------------------------------------------- 1 | a=2 2 | max_a=4 3 | 4 | path="SDE" 5 | if [ -d "./data/${path}/CRN" ] 6 | then 7 | rm -rf "./data/${path}/CRN" 8 | mkdir -p 
"./data/${path}/CRN" 9 | 10 | rm -rf "./data/CPU/${path}/CRN" 11 | mkdir -p "./data/CPU/${path}/CRN" 12 | else 13 | mkdir -p "./data/${path}/CRN" 14 | mkdir -p "./data/CPU/${path}/CRN" 15 | fi 16 | 17 | while [ $a -le $max_a ] 18 | do 19 | # Print the values 20 | echo $a 21 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_crn_model.jl $a 22 | # increment the value 23 | a=$((a*2)) 24 | done 25 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_cpp.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | # max_a=$((2**24)) 3 | max_a=$1 4 | while [ $a -le $max_a ] 5 | do 6 | echo $a 7 | sed -i "15d" ./GPU_ODE_MPGOS/Lorenz.cu 8 | sed -i "15 i #define SOLVER RK4" ./GPU_ODE_MPGOS/Lorenz.cu 9 | sed -i "17d" ./GPU_ODE_MPGOS/Lorenz.cu 10 | sed -i "17 i const int NT = $a;" ./GPU_ODE_MPGOS/Lorenz.cu 11 | 12 | make clean --directory=./GPU_ODE_MPGOS/ 13 | make --directory=./GPU_ODE_MPGOS/ 14 | ./GPU_ODE_MPGOS/Lorenz.exe $a 15 | 16 | sed -i "15d" ./GPU_ODE_MPGOS/Lorenz.cu 17 | sed -i "15 i #define SOLVER RKCK45" ./GPU_ODE_MPGOS/Lorenz.cu 18 | 19 | make clean --directory=./GPU_ODE_MPGOS/ 20 | make --directory=./GPU_ODE_MPGOS/ 21 | ./GPU_ODE_MPGOS/Lorenz.exe $a 22 | # increment the value 23 | a=$((a*4)) 24 | done 25 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPU_ODE_Julia" 2 | uuid = "d770f587-beb8-456f-87bf-3eef33441b01" 3 | authors = ["Utkarsh "] 4 | version = "0.1.0" 5 | 6 | [deps] 7 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 8 | Catalyst = "479239e8-5488-4da2-87a7-35f2df7eef83" 9 | DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" 10 | DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d" 11 | DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" 12 | ModelingToolkit = 
"961ee093-0014-501f-94e3-6117800e7a78" 13 | OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" 14 | ReactionNetworkImporters = "b4db0fb7-de2a-5028-82bf-5021f5cfa881" 15 | SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" 16 | SimpleDiffEq = "05bca326-078c-5bf0-a5bf-ce7c7982d7fd" 17 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 18 | StochasticDiffEq = "789caeaf-c7a9-5a7d-9973-96adeb23e2a0" 19 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/linear.jl: -------------------------------------------------------------------------------- 1 | using Random 2 | Random.seed!(123) 3 | 4 | # 1D Linear ODE 5 | function f(u::AbstractArray{T}, p, t::T) where {T} 6 | return T(1.01) * u 7 | end 8 | function f_analytic(u₀, p, t) 9 | u₀ * exp(1.01 * t) 10 | end 11 | 12 | tspan = (0.0, 10.0) 13 | tspan = T.(tspan) 14 | u0 = @SVector rand(T, 100) 15 | prob = ODEProblem(ODEFunction(f, analytic = f_analytic), u0, tspan) 16 | 17 | ensembleProb = EnsembleProblem(prob) 18 | 19 | ### Lower level API #### 20 | 21 | ## Building problems here only 22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | ## Make them compatible with CUDA 34 | probs = cu(probs) 35 | dt = T(0.1) 36 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/multistate.jl: -------------------------------------------------------------------------------- 1 | using ReactionNetworkImporters, Catalyst 2 | 3 | prnbng = loadrxnetwork(BNGNetwork(), joinpath(dirname(@__DIR__), "Models/multistate.net")) 4 | 5 | rn = prnbng.rn 6 | obs = [eq.lhs for eq in observed(rn)] 7 | 8 | osys = convert(ODESystem, rn) 9 | 10 | tf = 20.0 11 | tspan = (0.0, tf) 12 | oprob = 
ODEProblem{false}(osys, T[], tspan, T[]) 13 | 14 | prob = make_gpu_compatible(oprob, Val(T)) 15 | 16 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | ### Lower level API #### 21 | 22 | ## Building problems here only 23 | I = 1:numberOfParameters 24 | if ensembleProb.safetycopy 25 | probs = map(I) do i 26 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 27 | end 28 | else 29 | probs = map(I) do i 30 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 31 | end 32 | end 33 | 34 | ## Make them compatible with CUDA 35 | probs = cu(probs) 36 | dt = T(0.001) 37 | -------------------------------------------------------------------------------- /GPU_ODE_JAX/requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.2.1 2 | backcall==0.2.0 3 | backports.functools-lru-cache==1.6.4 4 | comm==0.1.2 5 | debugpy==1.6.6 6 | decorator==5.1.1 7 | diffrax==0.3.1 8 | equinox==0.10.1 9 | executing==1.2.0 10 | importlib-metadata==6.1.0 11 | ipykernel==6.19.2 12 | ipython==8.11.0 13 | jax==0.4.6 14 | jaxlib==0.4.6+cuda11.cudnn82 15 | jaxtyping==0.2.14 16 | jedi==0.18.2 17 | jupyter_client==8.1.0 18 | jupyter_core==5.3.0 19 | matplotlib-inline==0.1.6 20 | nest-asyncio==1.5.6 21 | numpy==1.24.2 22 | opt-einsum==3.3.0 23 | packaging==23.0 24 | parso==0.8.3 25 | pexpect==4.8.0 26 | pickleshare==0.7.5 27 | pip==23.0.1 28 | platformdirs==3.1.1 29 | prompt-toolkit==3.0.38 30 | psutil==5.9.4 31 | ptyprocess==0.7.0 32 | pure-eval==0.2.2 33 | Pygments==2.14.0 34 | python-dateutil==2.8.2 35 | pyzmq==25.0.2 36 | scipy==1.10.1 37 | setuptools==67.6.0 38 | six==1.16.0 39 | stack-data==0.6.2 40 | tornado==6.2 41 | traitlets==5.9.0 42 | typeguard==3.0.1 43 | typing_extensions==4.5.0 44 | wcwidth==0.2.6 45 | wheel==0.40.0 46 | zipp==3.15.0 47 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/src/GPU_ODE_Julia.jl: 
-------------------------------------------------------------------------------- 1 | module GPU_ODE_Julia 2 | using ModelingToolkit, StaticArrays, SciMLBase 3 | using DiffEqGPU 4 | 5 | function make_gpu_compatible(prob::T, ::Val{T1}) where {T <: ODEProblem, T1} 6 | sys = modelingtoolkitize(prob) 7 | prob = ODEProblem{false}(sys) 8 | remake(prob; u0 = SArray{Tuple{length(prob.u0)}, T1}(prob.u0), 9 | tspan = T1.(prob.tspan), 10 | p = prob.p isa SciMLBase.NullParameters ? prob.p : 11 | SArray{Tuple{length(prob.p)}, T1}(prob.p)) 12 | end 13 | 14 | struct GPUODE{T <: DiffEqGPU.GPUODEAlgorithm} <: SciMLBase.AbstractODEAlgorithm 15 | trajectories::Int 16 | end 17 | 18 | ## Wrapping for compat with WorkPrecisionSet 19 | function SciMLBase.__solve(prob::SciMLBase.AbstractODEProblem, alg::GPUODE{T}, args...; 20 | kwargs...) where {T} 21 | eprob = EnsembleProblem(prob) 22 | sol = solve(eprob, T(), EnsembleGPUKernel(0.0), trajectories = alg.trajectories; 23 | kwargs...) 24 | return sol[1] 25 | end 26 | 27 | export make_gpu_compatible, GPUODE 28 | 29 | end # module GPU_ODE_Julia 30 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/lorenz.jl: -------------------------------------------------------------------------------- 1 | function lorenz(u::AbstractArray{T}, p, t) where {T} 2 | du1 = T(10.0) * (u[2] - u[1]) 3 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 4 | du3 = u[1] * u[2] - T(8 // 3) * u[3] 5 | return @SVector T[du1, du2, du3] 6 | end 7 | 8 | u0 = @SVector T[1.0f0; 0.0f0; 0.0f0] 9 | tspan = (T(0.0), T(1.0)) 10 | p = @SArray T[28.0] 11 | prob = ODEProblem(lorenz, u0, tspan, p) 12 | 13 | parameterList = range(T(0.0), stop = T(21.0), length = numberOfParameters) 14 | 15 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 16 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 17 | 18 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 19 | 20 | ## Building problems here 
only 21 | I = 1:numberOfParameters 22 | if ensembleProb.safetycopy 23 | probs = map(I) do i 24 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 25 | end 26 | else 27 | probs = map(I) do i 28 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 29 | end 30 | end 31 | 32 | ## Make them compatible with CUDA 33 | probs = cu(probs) 34 | dt = T(0.001) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Utkarsh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/ProfileSpec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Bash version ${BASH_VERSION}..." 
3 | 4 | FileName=$2 5 | LogFileName=$FileName'.log' 6 | ProfFileName=$FileName'.nvprof' 7 | 8 | rm -f $LogFileName 9 | 10 | echo "--- SUMMARY ---" >> $LogFileName 11 | echo >> $LogFileName 12 | 13 | nvprof --unified-memory-profiling off --profile-api-trace none ./$1 2>>$LogFileName 14 | 15 | echo >> $LogFileName 16 | echo >> $LogFileName 17 | 18 | 19 | echo "--- SPECIFIC METRICS AND EVENTS ---" >> $LogFileName 20 | echo >> $LogFileName 21 | 22 | nvprof --unified-memory-profiling off --kernels :::1 --events elapsed_cycles_sm,active_cycles --metrics sm_efficiency,achieved_occupancy,eligible_warps_per_cycle,branch_efficiency,local_load_throughput,local_store_throughput,ipc,issued_ipc,flop_count_dp_add,flop_count_dp_mul,flop_count_dp_fma,inst_integer,inst_control,inst_compute_ld_st,inst_misc,flop_dp_efficiency,l1_shared_utilization,l2_utilization,dram_utilization,ldst_fu_utilization,alu_fu_utilization,stall_pipe_busy,stall_exec_dependency,stall_memory_dependency,stall_inst_fetch,stall_not_selected,stall_memory_throttle,stall_other ./$1 2>>$LogFileName 23 | 24 | echo >> $LogFileName 25 | echo >> $LogFileName -------------------------------------------------------------------------------- /GPU_ODE_Julia/sde_examples/bench_cpu.jl: -------------------------------------------------------------------------------- 1 | using DiffEqGPU, BenchmarkTools, StaticArrays, StochasticDiffEq 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 7 | 8 | # Defining the Problem 9 | # dX = pudt + qudW 10 | u₀ = SA[0.1, 0.1, 0.1] 11 | f(u, p, t) = p[1] * u 12 | g(u, p, t) = p[2] * u 13 | tspan = (0.00, 1.0) 14 | p = SA[1.5, 0.01] 15 | 16 | prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234) 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | @info "Solving the problem" 21 | 22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | data = @benchmark solve($ensembleProb, EM(), EnsembleThreads(), dt = Float64(1 // 2^8), 34 | adaptive = false, save_everystep = false, 35 | trajectories = numberOfParameters) 36 | 37 | if !isinteractive() 38 | open(joinpath(dirname(dirname(@__DIR__)), "data", "CPU/SDE/times_unadaptive.txt"), 39 | "a+") do io 40 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 41 | end 42 | end 43 | 44 | println("Parameter number: " * string(numberOfParameters)) 45 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 46 | println("Allocs: " * string(data.allocs)) 47 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/catalyst_models/multistate.net: -------------------------------------------------------------------------------- 1 | # Created by BioNetGen 2.7.0 2 | begin parameters 3 | 1 R0 5360 # Constant 4 | 2 L0 1160 # Constant 5 | 3 A0 5360 # Constant 6 | 4 kon 0.01 # Constant 7 | 5 koff 0.1 # Constant 8 | 6 kAon 0.01 # Constant 9 | 7 kAoff 0.1 # Constant 10 | 8 kAp 0.01 # Constant 11 | 9 kAdp 0.1 # Constant 12 | end parameters 13 | begin species 14 | 1 R(a,l) R0 15 | 2 L(r) L0 16 | 3 A(Y~U,r) A0 17 | 4 L(r!1).R(a,l!1) 0 18 | 5 A(Y~U,r!1).R(a!1,l) 0 19 | 6 A(Y~U,r!1).L(r!2).R(a!1,l!2) 0 20 | 7 A(Y~P,r!1).L(r!2).R(a!1,l!2) 0 21 | 8 A(Y~P,r!1).R(a!1,l) 0 22 | 9 A(Y~P,r) 
0 23 | end species 24 | begin reactions 25 | 1 1,2 4 kon #_R1 26 | 2 1,3 5 kAon #_R2 27 | 3 2,5 6 kon #_R1 28 | 4 4 1,2 koff #_reverse__R1 29 | 5 3,4 6 kAon #_R2 30 | 6 5 1,3 kAoff #_reverse__R2 31 | 7 6 2,5 koff #_reverse__R1 32 | 8 6 3,4 kAoff #_reverse__R2 33 | 9 6 7 kAp #_R3 34 | 10 7 2,8 koff #_reverse__R1 35 | 11 7 4,9 kAoff #_reverse__R2 36 | 12 7 6 kAdp #_R4 37 | 13 2,8 7 kon #_R1 38 | 14 1,9 8 kAon #_R2 39 | 15 4,9 7 kAon #_R2 40 | 16 8 1,9 kAoff #_reverse__R2 41 | 17 8 5 kAdp #_R4 42 | 18 9 3 kAdp #_R4 43 | end reactions 44 | begin groups 45 | 1 A_P 7,8,9 46 | 2 A_unbound_P 9 47 | 3 A_bound_P 7,8 48 | 4 RLA_P 7 49 | end groups 50 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_sde_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using LaTeXStrings 6 | using StatsPlots 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 15 | 16 | 17 | Julia_data = readdlm(joinpath(base_path, "SDE", "Julia_times_unadaptive.txt")) 18 | 19 | GPU_times = Julia_data[:, 2] .* 1e-3 20 | Ns = Julia_data[:, 1] 21 | 22 | CPU_data = readdlm(joinpath(base_path, "CPU", "SDE", "times_unadaptive.txt")) 23 | 24 | CPU_times = CPU_data[:, 2] .* 1e-3 25 | 26 | times["Fixed_CPU"] = mean(CPU_times) 27 | 28 | times["Fixed_GPU"] = mean(GPU_times) 29 | 30 | 31 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 32 | 33 | yticks = 10 .^ round.(range(1, -6, length = 15), digits = 2) 34 | 35 | 36 | 37 | plt = groupedbar( 38 | log10.(Ns), 39 | [GPU_times CPU_times], 40 | labels = ["GPU: Float32" "CPU: Float64"], 41 | yaxis = :log, 42 | yticks = yticks, 43 | ylabel = "Time (s)", 44 | xlabel = "Trajectories (" * L"$10^n$" * ")", 45 | legend = :topleft, 46 | title = "Performance Comparison of solving SDEs \n between CPU and GPU", 47 | dpi = 600, 48 | ) 49 | 50 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 51 | 52 | isdir(plots_dir) || mkdir(plots_dir) 53 | 54 | 55 | savefig(plt, joinpath(plots_dir, "CPU_SDE_$(Dates.value(Dates.now())).png")) 56 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/multisite2.jl: -------------------------------------------------------------------------------- 1 | using GPU_ODE_Julia 2 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq, ReactionNetworkImporters, 3 | Catalyst 4 | using CUDA 5 | 6 | @show ARGS 7 | #settings 8 | 9 | numberOfParameters = isinteractive() ? 
2 : parse(Int64, ARGS[1]) 10 | gpuID = 0 11 | 12 | device!(CuDevice(gpuID)) 13 | println("Running on " * string(CuDevice(gpuID))) 14 | 15 | prnbng = loadrxnetwork(BNGNetwork(), joinpath(@__DIR__, "Models/multisite2.net")) 16 | 17 | rn = prnbng.rn 18 | obs = [eq.lhs for eq in observed(rn)] 19 | 20 | osys = convert(ODESystem, rn) 21 | 22 | tf = 2.0 23 | tspan = (0.0, tf) 24 | oprob = ODEProblem{false}(osys, Float64[], tspan, Float64[]) 25 | 26 | T = Float64 27 | 28 | prob = make_gpu_compatible(oprob, Val(T)) 29 | 30 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 31 | 32 | ensembleProb = EnsembleProblem(prob) 33 | 34 | sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 0.001f0) 35 | 36 | ### Lower level API #### 37 | 38 | ## Building problems here only 39 | I = 1:numberOfParameters 40 | if ensembleProb.safetycopy 41 | probs = map(I) do i 42 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 43 | end 44 | else 45 | probs = map(I) do i 46 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 47 | end 48 | end 49 | 50 | ## Make them compatible with CUDA 51 | probs = cu(probs) 52 | 53 | @info "Solving the problem" 54 | sol = @time CUDA.@sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 55 | save_everystep = false, dt = T(0.001)) 56 | -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | asttokens==2.2.1 3 | backcall==0.2.0 4 | backports.functools-lru-cache==1.6.4 5 | certifi==2022.12.7 6 | charset-normalizer==3.0.1 7 | cmake==3.25.0 8 | comm==0.1.2 9 | debugpy==1.6.6 10 | decorator==5.1.1 11 | executing==1.2.0 12 | filelock==3.9.0 13 | functorch==1.13.0 14 | idna==3.4 15 | importlib-metadata==6.0.0 16 | ipykernel==6.19.2 17 | ipython==8.10.0 18 | jedi==0.18.2 19 | jupyter_client==8.0.3 20 | jupyter_core==5.2.0 21 | Mako==1.2.4 
22 | MarkupSafe==2.1.2 23 | matplotlib-inline==0.1.6 24 | mkl-fft==1.3.1 25 | mkl-random==1.2.2 26 | mkl-service==2.4.0 27 | mpmath==1.2.1 28 | nest-asyncio==1.5.6 29 | networkx==3.0rc1 30 | numpy==1.23.4 31 | nvidia-cublas-cu11==11.10.3.66 32 | nvidia-cuda-nvrtc-cu11==11.7.99 33 | nvidia-cuda-runtime-cu11==11.7.99 34 | nvidia-cudnn-cu11==8.5.0.96 35 | packaging==23.0 36 | parso==0.8.3 37 | pexpect==4.8.0 38 | pickleshare==0.7.5 39 | Pillow==9.4.0 40 | pip==22.2.2 41 | platformdirs==3.0.0 42 | prompt-toolkit==3.0.36 43 | psutil==5.9.4 44 | ptyprocess==0.7.0 45 | pure-eval==0.2.2 46 | pycuda==2022.2.2 47 | Pygments==2.14.0 48 | python-dateutil==2.8.2 49 | pytools==2022.1.14 50 | pytorch-triton==2.0.0+0d7e753227 51 | pyzmq==25.0.0 52 | requests==2.28.2 53 | scipy==1.10.1 54 | setuptools==67.4.0 55 | six==1.16.0 56 | stack-data==0.6.2 57 | sympy==1.11.1 58 | torch==2.0.0.dev20230202+cu116 59 | torchaudio==2.0.0.dev20230201+cu116 60 | torchdiffeq==0.2.3 61 | torchode==0.1.1.post1 62 | torchtyping==0.1.4 63 | torchvision==0.15.0.dev20230201+cu116 64 | tornado==6.2 65 | traitlets==5.9.0 66 | typeguard==2.13.3 67 | typing_extensions==4.4.0 68 | urllib3==1.26.14 69 | wcwidth==0.2.6 70 | wheel==0.38.4 71 | zipp==3.15.0 72 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_EventHandling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EVENTHANDLING_H 2 | #define SINGLESYSTEM_PERTHREAD_EVENTHANDLING_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_EventTimeStepControl(\ 7 | int tid, \ 8 | int& r_UpdateStep, \ 9 | int r_TerminateSimulation, \ 10 | Precision* r_ActualEventValue, \ 11 | Precision* r_NextEventValue, \ 12 | Precision* s_EventTolerance, \ 13 | int* s_EventDirection, \ 14 | Precision r_TimeStep, \ 15 | Precision& r_NewTimeStep, \ 16 | Precision MinimumTimeStep) 17 | { 18 | Precision 
EventTimeStep = r_TimeStep; 19 | int IsCorrected = 0; 20 | 21 | if ( ( r_UpdateStep == 1 ) && ( r_TerminateSimulation == 0 ) ) 22 | { 23 | for (int i=0; i s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 26 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 27 | { 28 | EventTimeStep = MPGOS::FMIN( EventTimeStep, -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * r_TimeStep ); 29 | IsCorrected = 1; 30 | } 31 | } 32 | } 33 | 34 | if ( IsCorrected == 1 ) 35 | { 36 | if ( EventTimeStep < MinimumTimeStep ) 37 | { 38 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (thread id: %d)\n", tid); 39 | } else 40 | { 41 | r_NewTimeStep = EventTimeStep; 42 | r_UpdateStep = 0; 43 | } 44 | } 45 | } 46 | 47 | #endif -------------------------------------------------------------------------------- /GPU_ODE_Julia/sde_examples/bench_gpu.jl: -------------------------------------------------------------------------------- 1 | using DiffEqGPU, DiffEqBase, StaticArrays, CUDA, BenchmarkTools 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 8192 : parse(Int64, ARGS[1]) 7 | 8 | # Defining the Problem 9 | # dX = pudt + qudW 10 | u₀ = SA[0.1f0, 0.1f0, 0.1f0] 11 | f(u, p, t) = p[1] * u 12 | g(u, p, t) = p[2] * u 13 | tspan = (0.0f0, 1.0f0) 14 | p = SA[1.5f0, 0.01f0] 15 | 16 | prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234) 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | ## Building problem for each trajectories. Since we just want to generate different 21 | ## time-series, the problem remains same. 
22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | ## Move the arrays to the GPU 34 | probs = cu(probs); 35 | 36 | ## Finally use the lower API for faster solves! (Fixed time-stepping) 37 | 38 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $prob, GPUEM(); 39 | save_everystep = false, 40 | dt = Float32(1 // 2^8)) 41 | 42 | if !isinteractive() 43 | open(joinpath(dirname(dirname(@__DIR__)), "data", "SDE", "Julia_times_unadaptive.txt"), 44 | "a+") do io 45 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 46 | end 47 | end 48 | 49 | println("Parameter number: " * string(numberOfParameters)) 50 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 51 | println("Allocs: " * string(data.allocs)) 52 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/pleiades.jl: -------------------------------------------------------------------------------- 1 | function f!(du::AbstractArray{T}, u::AbstractArray{T}, p, t::T) where {T} 2 | @inbounds begin 3 | x = view(u, 1:7) # x 4 | y = view(u, 8:14) # y 5 | v = view(u, 15:21) # x′ 6 | w = view(u, 22:28) # y′ 7 | du[1:7] .= v 8 | du[8:14] .= w 9 | for i in 15:28 10 | du[i] = zero(u[1]) 11 | end 12 | for i in 1:7, j in 1:7 13 | if i != j 14 | r = ((x[i] - x[j])^(2.0f0) + (y[i] - y[j])^(2.0f0))^(3.0f0 / 2.0f0) 15 | du[14 + i] += j * (x[j] - x[i]) / r 16 | du[21 + i] += j * (y[j] - y[i]) / r 17 | end 18 | end 19 | end 20 | du = T.(du) 21 | end 22 | 23 | u0 = T[3.0, 3.0, -1.0, -3.0, 2.0, -2.0, 2.0, 3.0, -3.0, 2.0, 0, 0, -4.0, 4.0, 0, 0, 0, 0, 0, 24 | 1.75, -1.5, 0, 0, 0, -1.25, 1, 0, 0] 25 | tspan = (0.0, 3.0) 26 | oprob = ODEProblem(f!, u0, T.(tspan)) 27 | 28 | prob = make_gpu_compatible(oprob, Val(T)) 29 | 
30 | @assert prob.f(prob.u0, prob.p, T(1.0)) isa StaticArray{<:Tuple, T} 31 | 32 | ensembleProb = EnsembleProblem(prob) 33 | dt = T(0.001) 34 | 35 | # sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 36 | 37 | # ### Lower level API #### 38 | 39 | ## Building problems here only 40 | I = 1:numberOfParameters 41 | if ensembleProb.safetycopy 42 | probs = map(I) do i 43 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 44 | end 45 | else 46 | probs = map(I) do i 47 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 48 | end 49 | end 50 | 51 | ## Make them compatible with CUDA 52 | probs = cu(probs) 53 | 54 | # @info "Solving the problem" 55 | # sol = @time @CUDA.sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 56 | # save_everystep = false, dt = 0.001f0) 57 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_mult_gpu.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using StatsPlots 6 | using LaTeXStrings 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir, "devices") 15 | 16 | CUDA_data = readdlm(joinpath(base_path, "CUDA", "Julia_times_unadaptive.txt")) 17 | 18 | CUDA_times = CUDA_data[:, 2] .* 1e-3 19 | Ns = CUDA_data[:, 1] 20 | 21 | oneAPI_data = readdlm(joinpath(base_path, "oneAPI", "Julia_times_unadaptive.txt")) 22 | 23 | oneAPI_times = oneAPI_data[:, 2] .* 1e-3 24 | 25 | AMDGPU_data = readdlm(joinpath(base_path, "AMDGPU", "Julia_times_unadaptive.txt")) 26 | 27 | AMDGPU_times = AMDGPU_data[:, 2] .* 1e-3 28 | 29 | Metal_data = readdlm(joinpath(base_path, "Metal", "Julia_times_unadaptive.txt")) 30 | 31 | Metal_times = Metal_data[:, 2] .* 1e-3 32 | 33 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 34 | 35 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 36 | 37 | s = "Trajectories (" * L"$10^n$" * ")" 38 | 39 | colors = collect(palette(:default)) 40 | 41 | plt = groupedbar( 42 | log10.(Ns), 43 | [CUDA_times oneAPI_times AMDGPU_times Metal_times], 44 | labels = ["CUDA" "oneAPI" "AMDGPU" "Metal"], 45 | yaxis = :log, 46 | yticks = yticks, 47 | ylabel = "Time (s)", 48 | xlabel = s, 49 | legend = :topleft, 50 | title = "Performance Comparison with different GPU backends", 51 | titlefontsize = 12, 52 | palette = [colors[3], colors[1], colors[2], colors[4]], 53 | dpi = 300, 54 | ) 55 | 56 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 57 | 58 | isdir(plots_dir) || mkdir(plots_dir) 59 | 60 | savefig(plt, joinpath(plots_dir, "Multi_GPU_unadaptive_$(Dates.value(Dates.now())).png")) 61 | -------------------------------------------------------------------------------- /run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | has_n_option=false 3 | while getopts l:d:m:n: flag 4 | do 5 | case "${flag}" in 6 | l) lang=${OPTARG};; 7 | d) dev=${OPTARG};; 8 | m) model=${OPTARG};; 9 | 
n) nmax=${OPTARG};has_n_option=true;; 10 | \?) echo "Unknown option -$OPTARG"; exit 1;; 11 | esac 12 | done 13 | if $has_n_option; then 14 | nmax=$nmax 15 | else 16 | nmax=$((2**24)) 17 | fi 18 | echo $lang 19 | if [ $lang == "julia" ]; then 20 | echo "Benchmarking ${lang^} ${dev^^} accelerated ensemble ${model^^} solvers..." 21 | if [ $dev == "cpu" ];then 22 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 23 | eval "$cmd" 24 | elif [ $model == "sde" ];then 25 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 26 | eval "$cmd" 27 | else 28 | if [ -d "./data/${lang^}" ]; 29 | then 30 | rm -f "./data/${lang^}"/* 31 | mkdir -p "./data/${lang^}" 32 | else 33 | mkdir -p "./data/${lang^}" 34 | fi 35 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 36 | eval "$cmd" 37 | fi 38 | elif [[ $lang == "jax" || $lang == "pytorch" || $lang == "cpp" ]]; then 39 | if [[ $model != "ode" || $dev != "gpu" ]]; then 40 | echo "The benchmarking of ensemble ${model^^} solvers on ${dev^^} with ${lang} is not supported. Please use -m flag with \"ode\" and -d with \"gpu\"." 41 | exit 1 42 | else 43 | echo "Benchmarking ${lang^^} ${dev^^} accelerated ensemble ${model^^} solvers..." 44 | if [ -d "./data/${lang^^}" ] 45 | then 46 | rm -rf "./data/${lang^^}"/* 47 | mkdir -p "./data/${lang^^}" 48 | else 49 | mkdir -p "./data/${lang^^}" 50 | fi 51 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 52 | eval "$cmd" 53 | fi 54 | fi -------------------------------------------------------------------------------- /runner_scripts/plot/plot_sde_crn.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using LaTeXStrings 6 | using StatsPlots 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 15 | 16 | Julia_data = readdlm(joinpath(base_path, "SDE", "CRN", "Julia_times_unadaptive.txt")) 17 | 18 | GPU_times = Julia_data[:, 2] .* 1e-3 19 | Ns = Julia_data[:, 1] 20 | 21 | CPU_data = readdlm(joinpath(base_path, "CPU", "SDE", "CRN", "Julia_times_unadaptive.txt")) 22 | 23 | CPU_times = CPU_data[:, 2] .* 1e-3 24 | 25 | times["Fixed_CPU"] = mean(CPU_times) 26 | 27 | times["Fixed_GPU"] = mean(GPU_times) 28 | 29 | 30 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 31 | 32 | yticks = 10 .^ round.(range(2, -3, length = 11), digits = 2) 33 | 34 | # plt = plot( 35 | # Ns, 36 | # GPU_times, 37 | # xaxis = :log, 38 | # yaxis = :log, 39 | # linewidth = 2, 40 | # label = "GPU: Float32", 41 | # ylabel = "Time (s)", 42 | # xlabel = "Trajectories", 43 | # title = "Lorenz Problem: 1000 fixed time-steps", 44 | # legend = :topleft, 45 | # xticks = xticks, 46 | # yticks = yticks, 47 | # marker = :circle, 48 | # ) 49 | 50 | 51 | 52 | plt = groupedbar( 53 | log10.(Ns), 54 | [GPU_times CPU_times], 55 | labels = ["GPU" "CPU"], 56 | yaxis = :log, 57 | yticks = yticks, 58 | ylabel = "Time (s)", 59 | xlabel = "Trajectories (" * L"$10^n$" * ")", 60 | legend = :topleft, 61 | title = "Performance Comparison of parallel-parameter \n sweeps in SDEs between CPU and GPU", 62 | dpi = 600, 63 | ) 64 | 65 | 66 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 67 | 68 | isdir(plots_dir) || mkdir(plots_dir) 69 | 70 | savefig(plt, joinpath(plots_dir, "CPU_SDE_CRN_$(Dates.value(Dates.now())).png")) 71 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_cpu.jl: -------------------------------------------------------------------------------- 1 | using BenchmarkTools, StaticArrays, OrdinaryDiffEq 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 7 | 8 | function lorenz(u, p, t) 9 | du1 = 10.0 * (u[2] - u[1]) 10 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 11 | du3 = u[1] * u[2] - 2.666 * u[3] 12 | return @SVector [du1, du2, du3] 13 | end 14 | 15 | u0 = @SVector [1.0; 0.0; 0.0] 16 | tspan = (0.0, 1.0) 17 | p = @SArray [21.0] 18 | prob = ODEProblem(lorenz, u0, tspan, p) 19 | 20 | parameterList = range(0.0, stop = 21.0, length = numberOfParameters) 21 | 22 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 23 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 24 | 25 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 26 | 27 | @info "Solving the problem" 28 | data = @benchmark solve($ensembleProb, Tsit5(), EnsembleThreads(), dt = 0.001, 29 | adaptive = false, save_everystep = false, 30 | trajectories = numberOfParameters) 31 | 32 | if !isinteractive() 33 | open(joinpath(dirname(@__DIR__), "data", "CPU", "times_unadaptive.txt"), "a+") do io 34 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 35 | end 36 | end 37 | 38 | println("Parameter number: " * string(numberOfParameters)) 39 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 40 | println("Allocs: " * string(data.allocs)) 41 | 42 | data = @benchmark solve($ensembleProb, Tsit5(), EnsembleThreads(), dt = 0.001, 43 | adaptive = true, save_everystep = false, 44 | trajectories = numberOfParameters) 45 | 46 | if !isinteractive() 47 | open(joinpath(dirname(@__DIR__), "data", "CPU", "times_adaptive.txt"), "a+") do io 48 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 49 | end 50 | end 51 | 52 | println("Parameter number: " * string(numberOfParameters)) 53 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 54 | println("Allocs: " * string(data.allocs)) 55 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/Lorenz_SystemDefinition.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef T2_PERTHREAD_SYSTEMDEFINITION_H 2 | #define T2_PERTHREAD_SYSTEMDEFINITION_H 3 | 4 | // SYSTEM 5 | template 6 | __forceinline__ __device__ void PerThread_OdeFunction(\ 7 | int tid, int NT, \ 8 | Precision* F, Precision* X, Precision T, \ 9 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 10 | { 11 | F[0] = 10.0*( X[1]-X[0] ); 12 | F[1] = cPAR[0]*X[0] - X[1] - X[0]*X[2]; 13 | F[2] = X[0]*X[1] - 2.666 * X[2]; 14 | } 15 | 16 | // EVENTS 17 | template 18 | __forceinline__ __device__ void PerThread_EventFunction(\ 19 | int tid, int NT, Precision* EF, \ 20 | Precision T, Precision dT, Precision* TD, Precision* X, \ 21 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 22 | { 23 | 24 | } 25 | 26 | template 27 | __forceinline__ __device__ void PerThread_ActionAfterEventDetection(\ 28 | int tid, int NT, int IDX, int& UDT, \ 29 | Precision &T, Precision &dT, Precision* TD, Precision* X, \ 30 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 31 | { 32 | 33 | } 34 | 35 | // ACCESSORIES 36 | template 37 | __forceinline__ __device__ void PerThread_ActionAfterSuccessfulTimeStep(\ 38 | int tid, int NT, int& UDT, \ 39 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 40 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 41 | { 42 | 43 | } 44 | 45 | template 46 | __forceinline__ __device__ void PerThread_Initialization(\ 47 | int tid, int NT, int& DOIDX, \ 48 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 49 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 50 | { 51 | 52 | } 53 | 54 | template 55 | __forceinline__ __device__ void PerThread_Finalization(\ 56 | int tid, int NT, int& DOIDX, \ 57 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 58 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 59 | { 60 | 
61 | } 62 | 63 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_DenseOutput.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_DENSEOUTPUT_H 2 | #define SINGLESYSTEM_PERTHREAD_DENSEOUTPUT_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_StoreDenseOutput(\ 7 | int tid, \ 8 | int r_UpdateDenseOutput, \ 9 | int& r_DenseOutputIndex, \ 10 | Precision* d_DenseOutputTimeInstances, \ 11 | Precision r_ActualTime, \ 12 | Precision* d_DenseOutputStates, \ 13 | Precision* r_ActualState, \ 14 | int& r_NumberOfSkippedStores, \ 15 | Precision& r_DenseOutputActualTime, \ 16 | Precision DenseOutputMinimumTimeStep, \ 17 | Precision UpperTimeDomain) 18 | { 19 | if ( r_UpdateDenseOutput == 1 ) 20 | { 21 | d_DenseOutputTimeInstances[tid + r_DenseOutputIndex*NT] = r_ActualTime; 22 | 23 | int DenseOutputStateIndex = tid + r_DenseOutputIndex*NT*SD; 24 | for (int i=0; i 41 | __forceinline__ __device__ void PerThread_DenseOutputStorageCondition(\ 42 | Precision r_ActualTime, \ 43 | Precision r_DenseOutputActualTime, \ 44 | int r_DenseOutputIndex, \ 45 | int r_NumberOfSkippedStores, \ 46 | int r_EndTimeDomainReached, \ 47 | int r_UserDefinedTermination, \ 48 | int& r_UpdateDenseOutput, \ 49 | Struct_SolverOptions SolverOptions) 50 | { 51 | if ( ( r_DenseOutputIndex < NDO ) && ( r_DenseOutputActualTime < r_ActualTime ) && ( r_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 52 | r_UpdateDenseOutput = 1; 53 | else 54 | r_UpdateDenseOutput = 0; 55 | 56 | if ( ( r_DenseOutputIndex < NDO ) && ( ( r_EndTimeDomainReached == 1 ) || ( r_UserDefinedTermination == 1 ) ) ) 57 | r_UpdateDenseOutput = 1; 58 | } 59 | 60 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/MPGOS_Overloaded_MathFunction.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef MPGOS_OVERLOADED_MATHFUNCTIONS_H 2 | #define MPGOS_OVERLOADED_MATHFUNCTIONS_H 3 | 4 | namespace MPGOS 5 | { 6 | // Floating point absolute value 7 | __forceinline__ __device__ float FABS(float a) 8 | { 9 | return fabsf(a); 10 | } 11 | 12 | __forceinline__ __device__ double FABS(double a) 13 | { 14 | return fabs(a); 15 | } 16 | 17 | 18 | // Floating point maximum ------------------------------------------------- 19 | __forceinline__ __device__ float FMAX(float a, float b) 20 | { 21 | return fmaxf(a, b); 22 | } 23 | 24 | __forceinline__ __device__ double FMAX(double a, double b) 25 | { 26 | return fmax(a, b); 27 | } 28 | 29 | 30 | // Floating point minimum ------------------------------------------------- 31 | __forceinline__ __device__ float FMIN(float a, float b) 32 | { 33 | return fminf(a, b); 34 | } 35 | 36 | __forceinline__ __device__ double FMIN(double a, double b) 37 | { 38 | return fmin(a, b); 39 | } 40 | 41 | // Floating point atomic minimum ------------------------------------------ 42 | __forceinline__ __device__ float atomicFMIN(float* address, float val) 43 | { 44 | int ret = __float_as_int(*address); 45 | while ( val < __int_as_float(ret) ) 46 | { 47 | int old = ret; 48 | if ( ( ret = atomicCAS((int *)address, old, __float_as_int(val)) ) == old ) 49 | break; 50 | } 51 | return __int_as_float(ret); 52 | } 53 | 54 | __forceinline__ __device__ double atomicFMIN(double *address, double val) 55 | { 56 | unsigned long long ret = __double_as_longlong(*address); 57 | while ( val < __longlong_as_double(ret) ) 58 | { 59 | unsigned long long old = ret; 60 | if ( ( ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val)) ) == old ) 61 | break; 62 | } 63 | return __longlong_as_double(ret); 64 | } 65 | 66 | // Floating point atomic maximum ------------------------------------------ 67 | __forceinline__ __device__ float atomicFMAX(float *address, float 
val) 68 | { 69 | int ret = __float_as_int(*address); 70 | while ( val > __int_as_float(ret) ) 71 | { 72 | int old = ret; 73 | if ( (ret = atomicCAS((int *)address, old, __float_as_int(val)) ) == old ) 74 | break; 75 | } 76 | return __int_as_float(ret); 77 | } 78 | 79 | __forceinline__ __device__ double atomicFMAX(double *address, double val) 80 | { 81 | unsigned long long ret = __double_as_longlong(*address); 82 | while ( val > __longlong_as_double(ret) ) 83 | { 84 | unsigned long long old = ret; 85 | if ( (ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val)) ) == old ) 86 | break; 87 | } 88 | return __longlong_as_double(ret); 89 | } 90 | } 91 | 92 | #endif -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/bench_torchdiffeq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # Benchmarking torchdiffeq ODE solvers for ensemble problems, via vmap. The Lorenz ODE is integrated by Tsit5. 
5 | 6 | # Created By: Utkarsh 7 | # Last Updated: 19 April 2023 8 | 9 | # %% 10 | 11 | import torch 12 | import sys 13 | import os 14 | import timeit 15 | import sys 16 | 17 | numberOfParameters = int(sys.argv[1]) 18 | 19 | # %% 20 | 21 | 22 | import torchdiffeq 23 | import math 24 | import torch.nn as nn 25 | import timeit 26 | from torchdiffeq import odeint 27 | 28 | 29 | # %% 30 | 31 | ## Checking if torch installation has cuda enabled 32 | print("CUDA enabled: ", torch.has_cuda) 33 | 34 | 35 | # %% 36 | # Defining the Lorenz ODE problem 37 | class LorenzODE(torch.nn.Module): 38 | 39 | def __init__(self, rho = torch.tensor(21.0)): 40 | super(LorenzODE, self).__init__() 41 | self.sigma = nn.Parameter(torch.as_tensor([10.0])) 42 | self.rho = nn.Parameter(rho) 43 | self.beta = nn.Parameter(torch.as_tensor([8/3])) 44 | 45 | def forward(self, t, u): 46 | x, y, z = u[0],u[1],u[2] 47 | du1 = self.sigma[0] * (y - x) 48 | du2 = x * (self.rho - z) - y 49 | du3 = x * y - self.beta[0] * z 50 | return torch.stack([du1, du2, du3]) 51 | 52 | 53 | # %% 54 | # Uncomment for smoke test 55 | 56 | # u0 = torch.tensor([1.0,0.0,0.0]).cuda() 57 | # t = torch.linspace(0, 1.0, 1001).cuda() 58 | # y = odeint(LorenzODE(), u0, t, method='rk4',options=dict(step_size=0.001)) 59 | 60 | 61 | # %% 62 | # Define the solve without gradient calculations 63 | # Note: I was't able to JIT compile the code with this application, torchdiffeq + vmap 64 | def solve(p): 65 | with torch.no_grad(): 66 | traj = odeint(LorenzODE(rho = p), u0, t, method='rk4', options=dict(step_size=0.001)) 67 | return traj 68 | 69 | # Define the initial conditions and timepoints to save 70 | u0 = torch.tensor([1.0,0.0,0.0]).cuda() 71 | t = torch.linspace(0, 1.0, 2).cuda() 72 | 73 | 74 | # %% 75 | # Generate parameter list 76 | parameters = torch.linspace(0.0,21.0,numberOfParameters).cuda() 77 | 78 | 79 | # %% 80 | 81 | import timeit 82 | res = timeit.repeat(lambda: torch.vmap(solve)(parameters), repeat = 10, number = 1) 83 | 
84 | 85 | # %% 86 | # Print the best result 87 | 88 | best_time = min(res)*1000 89 | print("{:} ODE solves with fixed time-stepping completed in {:.1f} ms".format(numberOfParameters, best_time)) 90 | 91 | 92 | # %% 93 | # Save the result 94 | 95 | file = open("./data/PYTORCH/Torch_times_unadaptive.txt","a+") 96 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 97 | file.close() 98 | 99 | 100 | # %% 101 | -------------------------------------------------------------------------------- /MPI/gpu_ode_mpi.jl: -------------------------------------------------------------------------------- 1 | """ 2 | Scaling GPU ODE solvers to multiple GPU cluster nodes with MPI. 3 | 4 | Created by: Utkarsh 5 | Last Modified: 20 April 2023 6 | """ 7 | 8 | using MPI 9 | using CUDA 10 | using DiffEqGPU, StaticArrays, CUDA, DiffEqBase 11 | using BenchmarkTools 12 | 13 | function split_count(N::Integer, n::Integer) 14 | q, r = divrem(N, n) 15 | return [i <= r ? q + 1 : q for i = 1:n] 16 | end 17 | 18 | 19 | MPI.Init() 20 | 21 | comm = MPI.COMM_WORLD 22 | rank = MPI.Comm_rank(comm) 23 | comm_size = MPI.Comm_size(comm) 24 | 25 | root = 0 26 | 27 | function lorenz(u, p, t) 28 | σ = p[1] 29 | ρ = p[2] 30 | β = p[3] 31 | du1 = σ * (u[2] - u[1]) 32 | du2 = u[1] * (ρ - u[3]) - u[2] 33 | du3 = u[1] * u[2] - β * u[3] 34 | return SVector{3}(du1, du2, du3) 35 | end 36 | 37 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 38 | tspan = (0.0f0, 10.0f0) 39 | p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0] 40 | prob = ODEProblem{false}(lorenz, u0, tspan, p) 41 | 42 | function perform_ode_solve(prob, parameter) 43 | trajectories = length(parameter) 44 | probs = map(1:trajectories) do i 45 | remake(prob, p = @SVector [10.0f0, parameter[i], 8 / 3.0f0]) 46 | end 47 | 48 | ## Move the arrays to the GPU 49 | probs = cu(probs) 50 | 51 | ts, us = DiffEqGPU.vectorized_asolve( 52 | probs, 53 | prob, 54 | GPUTsit5(); 55 | saveat = [prob.tspan[2]], 56 | dt = 0.1f0, 57 | ) 58 | end 59 | 60 | if rank == root 61 | M, N = 1, 
2^30 62 | 63 | test = collect(LinRange(0.0f0, 21.0f0, N)) 64 | output = CuArray{typeof(u0)}(undef, (1, N)) 65 | 66 | N_counts = split_count(N, comm_size - 1) 67 | 68 | sizes = pushfirst!(N_counts, 0) 69 | size_ubuf = UBuffer(sizes, 1) 70 | 71 | counts = sizes 72 | 73 | test_vbuf = VBuffer(test, counts) # VBuffer for scatter 74 | output_vbuf = VBuffer(output, counts) # VBuffer for gather 75 | else 76 | # these variables can be set to `nothing` on non-root processes 77 | size_ubuf = UBuffer(nothing) 78 | output_vbuf = test_vbuf = VBuffer(nothing) 79 | end 80 | 81 | MPI.Barrier(comm) 82 | 83 | local_size = MPI.Scatter(size_ubuf, NTuple{1,Int}, root, comm) 84 | local_test = MPI.Scatterv!(test_vbuf, zeros(Float32, local_size), root, comm) 85 | 86 | if rank != root 87 | ts, us = perform_ode_solve(prob, local_test) 88 | else 89 | us = CuArray{typeof(u0)}(undef, (1, 0)) 90 | end 91 | 92 | MPI.Barrier(comm) 93 | 94 | @show MPI.Get_processor_name(), size(us) 95 | 96 | MPI.Gatherv!(us, output_vbuf, root, comm) 97 | 98 | MPI.Barrier(comm) 99 | 100 | if rank == root 101 | println() 102 | println("Final matrix") 103 | println("================") 104 | @show size(output) 105 | end 106 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_lorenz_gpu.jl: -------------------------------------------------------------------------------- 1 | using Pkg 2 | 3 | Pkg.instantiate() 4 | Pkg.precompile() 5 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq 6 | using CUDA 7 | 8 | @show ARGS 9 | #settings 10 | 11 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 12 | 13 | function lorenz(u, p, t) 14 | du1 = 10.0f0 * (u[2] - u[1]) 15 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 16 | du3 = u[1] * u[2] - 2.666f0 * u[3] 17 | return @SVector [du1, du2, du3] 18 | end 19 | 20 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 21 | tspan = (0.0f0, 1.0f0) 22 | p = @SArray [21.0f0] 23 | prob = ODEProblem(lorenz, u0, tspan, p) 24 | 25 | parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters) 26 | 27 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 28 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 29 | 30 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 31 | 32 | ## Building problems here only 33 | I = 1:numberOfParameters 34 | if ensembleProb.safetycopy 35 | probs = map(I) do i 36 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 37 | end 38 | else 39 | probs = map(I) do i 40 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 41 | end 42 | end 43 | 44 | ## Make them compatible with CUDA 45 | probs = cu(probs) 46 | 47 | @info "Solving the problem" 48 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob, 49 | GPUTsit5(); 50 | save_everystep = false, 51 | dt = 0.001f0) 52 | 53 | if !isinteractive() 54 | open(joinpath(dirname(@__DIR__), "data", "Julia", "Julia_times_unadaptive.txt"), 55 | "a+") do io 56 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 57 | end 58 | end 59 | 60 | println("Parameter number: " * string(numberOfParameters)) 61 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 62 | println("Allocs: " * string(data.allocs)) 63 | 64 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob, 65 | GPUTsit5(); 66 | dt = 0.001f0, reltol = 1.0f-8, 67 | abstol = 1.0f-8) 68 | 69 | if !isinteractive() 70 | open(joinpath(dirname(@__DIR__), "data", "Julia", "Julia_times_adaptive.txt"), 71 | "a+") do io 72 | println(io, numberOfParameters, " ", 
minimum(data.times) / 1e6) 73 | end 74 | end 75 | 76 | println("Parameter number: " * string(numberOfParameters)) 77 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 78 | println("Allocs: " * string(data.allocs)) 79 | -------------------------------------------------------------------------------- /GPU_ODE_JAX/environment.yml: -------------------------------------------------------------------------------- 1 | name: venv_jax 2 | channels: 3 | - nvidia/label/cuda-11.8.0 4 | - conda-forge 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - asttokens=2.2.1=pyhd8ed1ab_0 11 | - backcall=0.2.0=pyh9f0ad1d_0 12 | - backports=1.0=pyhd8ed1ab_3 13 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 14 | - bzip2=1.0.8=h7f98852_4 15 | - ca-certificates=2022.12.7=ha878542_0 16 | - comm=0.1.2=pyhd8ed1ab_0 17 | - cuda-nvcc=11.8.89=0 18 | - cudatoolkit=11.8.0=h37601d7_11 19 | - cudnn=8.4.1.50=hed8a83a_0 20 | - debugpy=1.6.6=py39h227be39_0 21 | - decorator=5.1.1=pyhd8ed1ab_0 22 | - executing=1.2.0=pyhd8ed1ab_0 23 | - importlib-metadata=6.1.0=pyha770c72_0 24 | - importlib_metadata=6.1.0=hd8ed1ab_0 25 | - ipykernel=6.19.2=py39hb070fc8_0 26 | - ipython=8.11.0=pyh41d4057_0 27 | - jedi=0.18.2=pyhd8ed1ab_0 28 | - jupyter_client=8.1.0=pyhd8ed1ab_0 29 | - jupyter_core=5.3.0=py39hf3d152e_0 30 | - ld_impl_linux-64=2.40=h41732ed_0 31 | - libffi=3.4.2=h7f98852_5 32 | - libgcc-ng=12.2.0=h65d4601_19 33 | - libgomp=12.2.0=h65d4601_19 34 | - libnsl=2.0.0=h7f98852_0 35 | - libsodium=1.0.18=h36c2ea0_1 36 | - libsqlite=3.40.0=h753d276_0 37 | - libstdcxx-ng=12.2.0=h46fd767_19 38 | - libuuid=2.32.1=h7f98852_1000 39 | - libzlib=1.2.13=h166bdaf_4 40 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 41 | - ncurses=6.3=h27087fc_1 42 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 43 | - openssl=3.1.0=h0b41bf4_0 44 | - packaging=23.0=pyhd8ed1ab_0 45 | - parso=0.8.3=pyhd8ed1ab_0 46 | - pexpect=4.8.0=pyh1a96a4e_2 47 | - 
pickleshare=0.7.5=py_1003 48 | - pip=23.0.1=pyhd8ed1ab_0 49 | - platformdirs=3.1.1=pyhd8ed1ab_0 50 | - prompt-toolkit=3.0.38=pyha770c72_0 51 | - prompt_toolkit=3.0.38=hd8ed1ab_0 52 | - psutil=5.9.4=py39hb9d737c_0 53 | - ptyprocess=0.7.0=pyhd3deb0d_0 54 | - pure_eval=0.2.2=pyhd8ed1ab_0 55 | - pygments=2.14.0=pyhd8ed1ab_0 56 | - python=3.9.16=h2782a2a_0_cpython 57 | - python-dateutil=2.8.2=pyhd8ed1ab_0 58 | - python_abi=3.9=3_cp39 59 | - pyzmq=25.0.2=py39h0be026e_0 60 | - readline=8.1.2=h0f457ee_0 61 | - setuptools=67.6.0=pyhd8ed1ab_0 62 | - six=1.16.0=pyh6c4a22f_0 63 | - stack_data=0.6.2=pyhd8ed1ab_0 64 | - tk=8.6.12=h27826a3_0 65 | - tornado=6.2=py39hb9d737c_1 66 | - traitlets=5.9.0=pyhd8ed1ab_0 67 | - typing-extensions=4.5.0=hd8ed1ab_0 68 | - typing_extensions=4.5.0=pyha770c72_0 69 | - tzdata=2022g=h191b570_0 70 | - wcwidth=0.2.6=pyhd8ed1ab_0 71 | - wheel=0.40.0=pyhd8ed1ab_0 72 | - xz=5.2.6=h166bdaf_0 73 | - zeromq=4.3.4=h9c3ff4c_1 74 | - zipp=3.15.0=pyhd8ed1ab_0 75 | - pip: 76 | - diffrax==0.3.1 77 | - equinox==0.10.1 78 | - jax==0.4.6 79 | - jaxlib==0.4.6+cuda11.cudnn82 80 | - jaxtyping==0.2.14 81 | - numpy==1.24.2 82 | - opt-einsum==3.3.0 83 | - scipy==1.10.1 84 | - typeguard==3.0.1 -------------------------------------------------------------------------------- /runner_scripts/plot/plot_ode_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | 6 | using Plots.PlotMeasures 7 | 8 | 9 | parent_dir = 10 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 11 | joinpath("paper_artifacts", "data", "RTX_5000") 12 | 13 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 14 | 15 | times_v100 = Dict() 16 | 17 | Julia_data = readdlm(joinpath(base_path, "Julia", "Julia_times_unadaptive.txt")) 18 | 19 | Julia_times = Julia_data[:, 2] .* 1e-3 20 | Ns = Julia_data[:, 1] 21 | 22 | MPGOS_data = readdlm(joinpath(base_path, "CPP", "MPGOS_times_unadaptive.txt")) #= MPGOS (C++) results are stored under the CPP/ subdirectory in the shipped artifacts — TODO confirm runner output dir matches =# 23 | 24 | MPGOS_times = MPGOS_data[:, 2] .* 1e-3 25 | 26 | JAX_data = readdlm(joinpath(base_path, "JAX", "Jax_times_unadaptive.txt")) 27 | 28 | JAX_times = JAX_data[:, 2] .* 1e-3 29 | 30 | Torch_data = readdlm(joinpath(base_path, "PyTorch", "Torch_times_unadaptive.txt")) 31 | 32 | Torch_times = Torch_data[:, 2] .* 1e-3 33 | 34 | times_v100["Fixed_Julia"] = 35 | (minimum(Julia_times ./ Julia_times), maximum(Julia_times ./ Julia_times)) 36 | 37 | times_v100["Fixed_JAX"] = 38 | (minimum(JAX_times ./ Julia_times), maximum(JAX_times ./ Julia_times)) 39 | 40 | times_v100["Fixed_MPGOS"] = 41 | (minimum(MPGOS_times ./ Julia_times), maximum(MPGOS_times ./ Julia_times)) 42 | 43 | times_v100["Fixed_Torch"] = 44 | (minimum(Torch_times ./ Julia_times), maximum(Torch_times ./ Julia_times)) 45 | 46 | xticks = 10 .^ round.(range(1, 7, length = 13), digits = 2) 47 | 48 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 49 | gr(size = (810, 540)) 50 | plt = plot( 51 | Ns, 52 | Julia_times, 53 | xaxis = :log, 54 | yaxis = :log, 55 | linewidth = 2, 56 | label = "Julia", 57 | ylabel = "Time (s)", 58 | xlabel = "Trajectories", 59 | title = "Lorenz Problem: 1000 fixed time-steps", 60 | legend = :topleft, 61 | xticks = xticks, 62 | yticks = yticks, 63 | color = :Green, 64 | marker = :circle, 65 | dpi = 600, 66 | # left_margin = mm, bottom_margin = 4mm,top_margin = 6mm,right_margin = 6mm 67 | ) 68 | 69 | plt = plot!( 70 | Ns, 71 | MPGOS_times, 72 | xaxis = :log, 73 | yaxis = :log, 74 | linewidth = 2, 75 | label = "MPGOS", 76 | color = :Orange, 77 
| marker = :circle, 78 | ) 79 | 80 | plt = plot!( 81 | Ns, 82 | JAX_times, 83 | xaxis = :log, 84 | yaxis = :log, 85 | linewidth = 2, 86 | label = "JAX", 87 | color = :Red, 88 | marker = :circle, 89 | ) 90 | 91 | plt = plot!( 92 | Ns, 93 | Torch_times, 94 | xaxis = :log, 95 | yaxis = :log, 96 | linewidth = 2, 97 | label = "PyTorch", 98 | color = :DarkRed, 99 | marker = :circle, 100 | ) 101 | 102 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 103 | 104 | isdir(plots_dir) || mkdir(plots_dir) 105 | 106 | 107 | savefig(plt, joinpath(plots_dir, "Lorenz_unadaptive_$(Dates.value(Dates.now())).png")) 108 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_ensemblegpuarray.jl: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmarking of the Julia's EnsembleGPUArray GPU acceleration. The implementation is similar 3 | to the vectorized map approach. The timings are stored in ./data folder, with Julia_EnGPUArray_times 4 | ".txt" file. 5 | 6 | Created by: Utkarsh 7 | Last Updated: 18 April 2023 8 | """ 9 | 10 | using DiffEqGPU, BenchmarkTools, StaticArrays, OrdinaryDiffEq 11 | using CUDA 12 | 13 | @show ARGS 14 | #settings 15 | 16 | numberOfParameters = isinteractive() ? 
8388608 : parse(Int64, ARGS[1]) 17 | 18 | function lorenz(u, p, t) 19 | du1 = 10.0f0 * (u[2] - u[1]) 20 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 21 | du3 = u[1] * u[2] - 2.666f0 * u[3] 22 | return @SVector [du1, du2, du3] 23 | end 24 | 25 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 26 | tspan = (0.0f0, 1.0f0) 27 | p = @SArray [21.0f0] 28 | prob = ODEProblem(lorenz, u0, tspan, p) 29 | 30 | ## parameter list uniformly varying the single lorenz parameter 31 | parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters) 32 | 33 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 34 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 35 | 36 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 37 | 38 | batch = 1:numberOfParameters 39 | if ensembleProb.safetycopy 40 | probs = map(batch) do i 41 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 42 | end 43 | else 44 | probs = map(batch) do i 45 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 46 | end 47 | end 48 | 49 | @info "Solving the problem" 50 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_map_solve($probs, RK4(), 51 | EnsembleGPUArray(0.0), $batch, 52 | false, dt = 0.001f0, 53 | save_everystep = false, 54 | dense = false) 55 | 56 | if !isinteractive() 57 | open(joinpath(dirname(@__DIR__), "data", "EnsembleGPUArray", 58 | "Julia_EnGPUArray_times_unadaptive.txt"), "a+") do io 59 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 60 | end 61 | end 62 | 63 | println("Parameter number: " * string(numberOfParameters)) 64 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 65 | println("Allocs: " * string(data.allocs)) 66 | 67 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_map_solve($probs, Tsit5(), 68 | EnsembleGPUArray(0.0), $batch, 69 | true, dt = 0.001f0, 70 | save_everystep = false, 71 | dense = false) 72 | 73 | if !isinteractive() 74 | open(joinpath(dirname(@__DIR__), "data", "EnsembleGPUArray", 75 | 
"Julia_EnGPUArray_times_adaptive.txt"), "a+") do io 76 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 77 | end 78 | end 79 | 80 | println("Parameter number: " * string(numberOfParameters)) 81 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 82 | println("Allocs: " * string(data.allocs)) 83 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_ERRORCONTROLLERS_H 2 | #define SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_ERRORCONTROLLERS_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_ErrorController_RK4(\ 7 | int tid, \ 8 | Precision InitialTimeStep, \ 9 | int& r_IsFinite, \ 10 | int& r_TerminateSimulation, \ 11 | Precision& r_NewTimeStep) 12 | { 13 | if ( r_IsFinite == 0 ) 14 | { 15 | printf("Error: State is not a finite number. Try to use smaller step size. 
(thread id: %d)\n", tid); 16 | r_TerminateSimulation = 1; 17 | } 18 | 19 | r_NewTimeStep = InitialTimeStep; 20 | } 21 | 22 | 23 | template 24 | __forceinline__ __device__ void PerThread_ErrorController_RKCK45(\ 25 | int tid, \ 26 | Precision r_TimeStep, \ 27 | Precision* r_ActualState, \ 28 | Precision* r_NextState, \ 29 | Precision* r_Error, \ 30 | Precision* s_RelativeTolerance, \ 31 | Precision* s_AbsoluteTolerance, \ 32 | int& r_UpdateStep, \ 33 | int& r_IsFinite, \ 34 | int& r_TerminateSimulation, \ 35 | Precision& r_NewTimeStep, \ 36 | Struct_SolverOptions SolverOptions) 37 | { 38 | Precision RelativeError = 1e30; 39 | Precision ErrorTolerance; 40 | Precision TimeStepMultiplicator; 41 | 42 | for (int i=0; i(0.9) * pow(RelativeError, static_cast(1.0/5.0) ); 52 | else 53 | TimeStepMultiplicator = static_cast(0.9) * pow(RelativeError, static_cast(1.0/4.0) ); 54 | 55 | if ( isfinite(TimeStepMultiplicator) == 0 ) 56 | r_IsFinite = 0; 57 | 58 | 59 | if ( r_IsFinite == 0 ) 60 | { 61 | TimeStepMultiplicator = SolverOptions.TimeStepShrinkLimit; 62 | r_UpdateStep = 0; 63 | 64 | if ( r_TimeStep < (SolverOptions.MinimumTimeStep*static_cast(1.01)) ) 65 | { 66 | printf("Error: State is not a finite number even with the minimal step size. Try to use less stringent tolerances. (thread id: %d)\n", tid); 67 | r_TerminateSimulation = 1; 68 | } 69 | } else 70 | { 71 | if ( r_TimeStep < (SolverOptions.MinimumTimeStep*static_cast(1.01)) ) 72 | { 73 | printf("Warning: Minimum step size reached! Continue with fixed minimum step size! 
Tolerance cannot be guaranteed!, thread id: %d, time step: %+6.5e, min step size: %+6.5e \n", tid, r_TimeStep, SolverOptions.MinimumTimeStep); 74 | r_UpdateStep = 1; 75 | } 76 | } 77 | 78 | 79 | TimeStepMultiplicator = MPGOS::FMIN(TimeStepMultiplicator, SolverOptions.TimeStepGrowLimit); 80 | TimeStepMultiplicator = MPGOS::FMAX(TimeStepMultiplicator, SolverOptions.TimeStepShrinkLimit); 81 | 82 | r_NewTimeStep = r_TimeStep * TimeStepMultiplicator; 83 | 84 | r_NewTimeStep = MPGOS::FMIN(r_NewTimeStep, SolverOptions.MaximumTimeStep); 85 | r_NewTimeStep = MPGOS::FMAX(r_NewTimeStep, SolverOptions.MinimumTimeStep); 86 | } 87 | 88 | #endif -------------------------------------------------------------------------------- /GPU_ODE_JAX/bench_diffrax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # Benchmarking Diffrax ODE solvers for ensemble problems, via vmap. The Lorenz ODE is integrated by Tsit5. 5 | 6 | # Created By: Utkarsh 7 | # Last Updated: 19 April 2023 8 | 9 | 10 | # %% 11 | import time 12 | 13 | import diffrax 14 | import equinox as eqx 15 | import jax 16 | import jax.numpy as jnp 17 | import numpy as np 18 | import os 19 | import timeit 20 | import sys 21 | 22 | numberOfParameters = int(sys.argv[1]) 23 | 24 | # %% 25 | 26 | 27 | from jax.lib import xla_bridge 28 | print("Working on :", xla_bridge.get_backend().platform) 29 | 30 | 31 | # %% 32 | # Defining the Lorenz Problem 33 | class Lorenz(eqx.Module): 34 | k1: float 35 | 36 | def __call__(self, t, y, args): 37 | f0 = 10.0*(y[1] - y[0]) 38 | f1 = self.k1 * y[0] - y[1] - y[0] * y[2] 39 | f2 = y[0] * y[1] - (8/3)*y[2] 40 | return jnp.stack([f0, f1, f2]) 41 | 42 | 43 | # %% 44 | # JIT compilation of ODE solver 45 | @jax.jit 46 | @jax.vmap 47 | def main(k1): 48 | lorenz = Lorenz(k1) 49 | terms = diffrax.ODETerm(lorenz) 50 | t0 = 0.0 51 | t1 = 1.0 52 | y0 = jnp.array([1.0, 0.0, 0.0]) 53 | dt0 = 0.001 54 | solver = 
diffrax.Tsit5() 55 | saveat = diffrax.SaveAt(ts = jnp.array([t0,t1])) 56 | stepsize_controller = diffrax.PIDController(rtol=1e-6, atol=1e-3) 57 | sol = diffrax.diffeqsolve( 58 | terms, 59 | solver, 60 | t0, 61 | t1, 62 | dt0, 63 | y0, 64 | ) 65 | return sol 66 | 67 | # %% 68 | # Setting up parameters for parallel simulation 69 | parameterList = jnp.linspace(0.0,21.0,numberOfParameters) 70 | 71 | # Test that vmap and JIT ordering does not make a noticeable difference: 72 | # https://colab.research.google.com/drive/1d7G-O5JX31lHbg7jTzzozbo5-Gp7DBEv?usp=sharing 73 | 74 | # %% 75 | # Use jax.vmap to compute parallel solutions of the ODE 76 | res = timeit.repeat(lambda: main(parameterList),repeat = 100,number = 1) 77 | 78 | best_time = min(res)*1000 79 | print("{:} ODE solves with fixed time-stepping completed in {:.1f} ms".format(numberOfParameters, best_time)) 80 | 81 | 82 | # %% 83 | # Save the minimum time 84 | file = open("./data/JAX/Jax_times_unadaptive.txt","a+") 85 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 86 | file.close() 87 | 88 | 89 | # %% 90 | # Repeat the same for adaptive time-stepping 91 | @jax.jit 92 | @jax.vmap 93 | def main(k1): 94 | lorenz = Lorenz(k1) 95 | terms = diffrax.ODETerm(lorenz) 96 | t0 = 0.0 97 | t1 = 1.0 98 | y0 = jnp.array([1.0, 0.0, 0.0]) 99 | dt0 = 0.001 100 | solver = diffrax.Tsit5() 101 | saveat = diffrax.SaveAt(ts = jnp.array([t0,t1])) 102 | stepsize_controller = diffrax.PIDController(rtol=1e-8, atol=1e-8) 103 | sol = diffrax.diffeqsolve( 104 | terms, 105 | solver, 106 | t0, 107 | t1, 108 | dt0, 109 | y0, 110 | # saveat=saveat, 111 | stepsize_controller=stepsize_controller, 112 | ) 113 | return sol 114 | 115 | 116 | # %% 117 | 118 | 119 | import timeit 120 | 121 | 122 | # %% 123 | 124 | 125 | res = timeit.repeat(lambda: main(parameterList),repeat = 100,number = 1) 126 | 127 | 128 | # %% 129 | 130 | best_time = min(res)*1000 131 | print("{:} ODE solves with adaptive time-stepping completed in {:.1f} 
ms".format(numberOfParameters, best_time)) 132 | 133 | 134 | # %% 135 | 136 | 137 | file = open("./data/JAX/Jax_times_adaptive.txt","a+") 138 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 139 | file.close() 140 | 141 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_cpu_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | 6 | gr(size = (720, 480)) 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? joinpath(ARGS[1], "data") : joinpath("paper_artifacts", "data") 12 | 13 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir, "Julia") 14 | 15 | if length(ARGS) != 0 16 | Julia_data = readdlm(joinpath(base_path, "Julia_times_unadaptive.txt")) 17 | else 18 | Julia_data = readdlm( 19 | joinpath( 20 | dirname(dirname(@__DIR__)), 21 | parent_dir, 22 | "Tesla_V100", 23 | "Julia", 24 | "Julia_times_unadaptive.txt", 25 | ), 26 | ) 27 | end 28 | 29 | GPU_times = Julia_data[:, 2][1:9] .* 1e-3 30 | Ns = Julia_data[:, 1][1:9] 31 | 32 | Julia_EGArray_data = readdlm( 33 | joinpath(base_path, "EnsembleGPUArray", "Julia_EnGPUArray_times_unadaptive.txt"), 34 | ) 35 | 36 | GPU_EGArray_times = Julia_EGArray_data[:, 2][1:9] .* 1e-3 37 | 38 | CPU_data = readdlm(joinpath(base_path, "CPU", "times_unadaptive.txt")) 39 | 40 | CPU_times = CPU_data[:, 2] .* 1e-3 41 | 42 | times["Fixed_CPU"] = mean(CPU_times ./ GPU_times) 43 | 44 | times["Fixed_GPU"] = mean(GPU_times ./ GPU_times) 45 | 46 | times["Fixed_GPU_vmap"] = mean(GPU_EGArray_times ./ GPU_times) 47 | 48 | xticks = 10 .^ round.(range(1, 7, length = 13), digits = 2) 49 | 50 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 51 | 52 | plt = plot( 53 | Ns, 54 | GPU_times, 55 | xaxis = :log, 56 | yaxis = :log, 57 | linewidth = 2, 58 | label = "EnsembleGPUKernel: Fixed dt", 59 | ylabel = "Time (s)", 60 | xlabel = 
"Trajectories", 61 | title = "Benchmarking the Lorenz Problem", 62 | legend = :topleft, 63 | xticks = xticks, 64 | yticks = yticks, 65 | marker = :circle, 66 | dpi = 600, 67 | color = :Green, 68 | ) 69 | 70 | 71 | plt = plot!( 72 | Ns, 73 | CPU_times, 74 | xaxis = :log, 75 | yaxis = :log, 76 | linewidth = 2, 77 | label = "CPU: Fixed dt", 78 | marker = :circle, 79 | color = :Orange, 80 | ) 81 | 82 | plt = plot!( 83 | Ns, 84 | GPU_EGArray_times, 85 | xaxis = :log, 86 | yaxis = :log, 87 | linewidth = 2, 88 | label = "EnsembleGPUArray: Fixed dt", 89 | marker = :circle, 90 | color = :Red, 91 | ) 92 | 93 | 94 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 95 | 96 | isdir(plots_dir) || mkdir(plots_dir) 97 | 98 | 99 | if length(ARGS) != 0 100 | Julia_data = readdlm(joinpath(base_path, "Julia_times_adaptive.txt")) 101 | else 102 | Julia_data = readdlm( 103 | joinpath( 104 | dirname(dirname(@__DIR__)), 105 | parent_dir, 106 | "Tesla_V100", 107 | "Julia", 108 | "Julia_times_adaptive.txt", 109 | ), 110 | ) 111 | end 112 | 113 | GPU_times = Julia_data[:, 2][1:9] .* 1e-3 114 | Ns = Julia_data[:, 1][1:9] 115 | 116 | Julia_EGArray_data = 117 | readdlm(joinpath(base_path, "EnsembleGPUArray", "Julia_EnGPUArray_times_adaptive.txt")) 118 | 119 | GPU_EGArray_times = Julia_EGArray_data[:, 2][1:9] .* 1e-3 120 | 121 | CPU_data = readdlm(joinpath(base_path, "CPU", "times_adaptive.txt")) #= adaptive section must read the adaptive CPU timings, matching the Julia/EnsembleGPUArray adaptive files above =# 122 | 123 | CPU_times = CPU_data[:, 2] .* 1e-3 124 | 125 | times["Adaptive_CPU"] = mean(CPU_times ./ GPU_times) 126 | 127 | times["Adaptive_GPU"] = mean(GPU_times ./ GPU_times) 128 | 129 | times["Adaptive_GPU_vmap"] = mean(GPU_EGArray_times ./ GPU_times) 130 | 131 | 132 | plt = plot!( 133 | Ns, 134 | GPU_times, 135 | xaxis = :log, 136 | yaxis = :log, 137 | linewidth = 2, 138 | marker = :ltriangle, 139 | dpi = 600, 140 | color = :Green, 141 | label = "EnsembleGPUKernel: Adaptive dt", 142 | ) 143 | 144 | plt = plot!( 145 | Ns, 146 | CPU_times, 147 | xaxis = :log, 148 | yaxis = :log, 149 | 
using CUDA, DiffEqGPU, OrdinaryDiffEq, Plots, Serialization, StaticArrays, Distributions, LinearAlgebra
import DataInterpolations
const DI = DataInterpolations

trajectories = 100
u0 = @SVector [0.0f0, 0.0f0, 10000.0f0, 0f0, 0f0, 0f0]   # (x, y, z, vx, vy, vz)
tspan = (0.0f0, 50.0f0)
saveat = LinRange(tspan..., 100)
p = @SVector [25f0, 225f0, 9.807f0]                       # (CdS, mass, g)
CdS_dist = Normal(0f0, 1f0)                               # per-trajectory CdS perturbation

## Example where interpolation is performed on GPU

# Load the wind/density forecast once.
# (The original script repeated this load + table construction twice
# verbatim; the duplicate block has been removed.)
data = deserialize("forecast.txt")
N = length(data.altitude)

# One static 4-vector per altitude level: (altitude, windx, windy, density).
weather_sa = map(data.altitude, data.windx, data.windy, data.density) do alt, wx, wy, ρ
    SVector{4}(alt, wx, wy, ρ)
end

weather_sa = SVector{length(weather_sa)}(weather_sa)

# Linear interpolant over altitude; columns of the data matrix are levels.
interp = DI.LinearInterpolation{true}(hcat(weather_sa...), data.altitude)

# Query the interpolant at altitude `z`.
# Returns (wind, ρ): wind as (wx, wy, 0) and the air density ρ.
function get_weather(itp::DI.LinearInterpolation, z)
    weather = itp(z)
    wind = SVector{3}(weather[2], weather[3], 0f0)
    ρ = weather[4]
    wind, ρ
end

### solving the ODE on GPU + Interpolation using DataInterpolations

# Ballistic RHS: u = (position, velocity), p = ((CdS, mass, g), interpolant).
function ballistic_gpu(u, p, t)
    CdS, mass, g = p[1]
    interp = p[2]
    vel = @view u[4:6]

    wind, ρ = get_weather(interp, u[3])

    airvelocity = vel - wind
    airspeed = norm(airvelocity)
    # Drag acceleration plus gravity.
    # NOTE(review): the drag term is divided by mass but the gravity term is
    # multiplied by mass — dimensionally this looks like it should be just
    # `SVector{3}(0f0, 0f0, g)`. Kept as-is to preserve benchmark results;
    # confirm intent before changing.
    accel = -(ρ * CdS * airspeed) / (2 * mass) * airvelocity - mass * SVector{3}(0f0, 0f0, g)

    return SVector{6}(vel..., accel...)
end

prob_interp = ODEProblem{false}(ballistic_gpu, u0, tspan, (p, interp))

prob_func = (prob, i, repeat) -> remake(prob_interp,
    p = (p + SVector{3}(rand(CdS_dist), 0f0, 0f0), interp))
eprob_interp = EnsembleProblem(prob_interp, prob_func = prob_func, safetycopy = false)

esol_gpu = solve(eprob_interp, GPUTsit5(), EnsembleGPUKernel(CUDABackend(), 0.0);
    trajectories, saveat)

using BenchmarkTools

@benchmark esol_gpu = solve(eprob_interp, GPUTsit5(),
    EnsembleGPUKernel(CUDABackend(), 0.0); trajectories, saveat)

## Replace interpolation with textured-memory

# CuTextureArray wants plain tuples, not SVectors.
weather = map(weather_sa) do w
    (w...,)
end

weather_TA = CuTextureArray(weather)
texture = CuTexture(weather_TA; address_mode = CUDA.ADDRESS_MODE_CLAMP,
    normalized_coordinates = true, interpolation = CUDA.LinearInterpolation())

## Test Texture interpolation
idx = LinRange(0f0, 1f0, 4000)
idx_gpu = CuArray(idx)
idx_tlu = (1f0 - 1f0 / N) * idx_gpu .+ 0.5f0 / N # normalized table lookup form https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#table-lookup
dst_gpu = CuArray{NTuple{4, Float32}}(undef, size(idx))
dst_gpu .= getindex.(Ref(texture), idx_tlu) # interpolation ℝ->ℝ⁴

def_zmax = data.altitude[end]
N = length(data.altitude)

# Texture-memory variant of `get_weather`: altitude is mapped to the
# normalized table-lookup coordinate, then the hardware interpolator is
# sampled via indexing.
@inline function get_weather(tex, z, zmax, N)
    idx = (1f0 - 1f0 / N) * z / zmax + 0.5f0 / N # normalized input for table lookup based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#table-lookup
    weather = tex[idx]
    wind = SVector{3}(weather[2], weather[3], 0f0)
    ρ = weather[4]
    wind, ρ
end
### Experimentation

# Same ballistic right-hand side as `ballistic_gpu`, except the atmospheric
# state comes from the CUDA texture lookup instead of a DataInterpolations
# interpolant. p = ((CdS, mass, g), texture, zmax, N).
function ballistic_t(u, p, t)
    CdS, mass, g = p[1]
    tex = p[2]
    zmax = p[3]
    npts = p[4]

    velocity = @view u[4:6]
    wind, ρ = get_weather(tex, u[3], zmax, npts)

    v_air = velocity - wind
    speed = norm(v_air)
    drag = -(ρ * CdS * speed) / (2 * mass) * v_air
    gravity = mass * SVector{3}(0f0, 0f0, g)

    return SVector{6}(velocity..., (drag - gravity)...)
end

prob_tx = ODEProblem(ballistic_t, u0, tspan, (p, texture, def_zmax, N))

using Adapt

# Teach CUDA's kernel adaptor to move a CuArray of ODEProblems onto the
# device: adapt each contained problem first, then hand back the raw device
# array exactly as the default method would.
function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuArray{<:ODEProblem})
    # first convert the contained ODE problems
    y = CuArray(adapt.(Ref(to), Array(x)))
    # continue doing what the default method does
    Base.unsafe_convert(CuDeviceArray{eltype(y), ndims(y), CUDA.AS.Global}, y)
end

prob_func = (prob, i, repeat) -> remake(prob,
    p = (p + SVector{3}(rand(CdS_dist), 0f0, 0f0), texture, def_zmax, N))
eprob_texture = EnsembleProblem(prob_tx, prob_func = prob_func, safetycopy = false)

esol_gpu = solve(eprob_texture, GPUTsit5(), EnsembleGPUKernel(CUDABackend(), 0.0);
    trajectories, saveat)

@benchmark esol_gpu = solve(eprob_texture, GPUTsit5(),
    EnsembleGPUKernel(CUDABackend(), 0.0); trajectories, saveat)
# Declare the model (using Catalyst).
σGen_system = @reaction_network begin
    (v0 + (S * σ)^n / ((S * σ)^n + (D * A3)^n + 1), 1.0), ∅ ↔ σ
    (σ / τ, 1 / τ), ∅ ↔ A1
    (A1 / τ, 1 / τ), ∅ ↔ A2
    (A2 / τ, 1 / τ), ∅ ↔ A3
end S D τ v0 n η

# Declares the parameter values.
σGen_parameters = [:S => 2.3, :D => 5.0, :τ => 10.0, :v0 => 0.1, :n => 3, :η => 0.1]

# Set ensemble parameter values.
S_grid = Float32.(10 .^ (range(-1.0, stop = 2, length = N)))
D_grid = Float32.(10 .^ (range(-1, stop = 2, length = N)))
τ_grid = Float32[0.1, 0.15, 0.20, 0.30, 0.50, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0, 7.50, 10.0,
    15.0, 20.0, 30.0, 50.0, 75.0, 100.0][1:2:19]
v0_grid = Float32[0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.15, 0.20]
n_grid = Float32[2.0, 3.0, 4.0]
η_grid = Float32[0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]

# Cartesian product of all parameter grids; one entry per trajectory.
parameters = collect(Iterators.product(S_grid, D_grid, τ_grid, v0_grid, n_grid, η_grid));

numberOfParameters = length(parameters)

@show numberOfParameters

# prob_func for the parameter scan: trajectory `i` gets the i-th grid point
# as its (static) parameter vector.
#
# FIX(review): the original looped over *all* `parameters`, ignored `i`,
# discarded every `remake` result (a bare `for` loop returns `nothing`), and
# referenced `T1` before it was defined — so every trajectory would have run
# with the template problem's parameters. Now returns the remade problem for
# index `i`, mirroring the `prob_func` used in the benchmark below.
function σGen_p_func(prob, i, repeat)
    remake(prob; p = SVector{6, Float32}(parameters[i]...))
end

# Declare initial condition.
σGen_u0 = [:σ => 0.1, :A1 => 0.1, :A2 => 0.1, :A3 => 0.1] # (for some S values, the system will start far away from the steady state).
49 | σGen_sprob = SDEProblem(σGen_system, σGen_u0, (0.0, 1000.0), σGen_parameters, 50 | noise_scaling = (@parameters η)[1]) 51 | 52 | ### Experimentation 53 | sys = modelingtoolkitize(σGen_sprob) 54 | T1 = Float32 55 | prob = SDEProblem{false}(sys, SVector{length(σGen_sprob.u0), T1}(σGen_sprob.u0), 56 | Float32.(σGen_sprob.tspan), 57 | SVector{length(σGen_sprob.p), T1}(σGen_sprob.p), 58 | noise_rate_prototype = SMatrix{ 59 | size(σGen_sprob.noise_rate_prototype)..., 60 | T1}(σGen_sprob.noise_rate_prototype)) 61 | 62 | using DiffEqGPU 63 | 64 | # parameter as cartesian product of the ranges, initial condition as [v0,v0,v0,v0] 65 | function prob_func(prob, i, repeat) 66 | remake(prob; p = SVector{6, T1}(parameters[i]...), 67 | u0 = SVector{4, T1}(parameters[i][4], parameters[i][4], parameters[i][4], 68 | parameters[i][4])) 69 | end 70 | 71 | eprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false) 72 | 73 | saveat = T1(0.0f0):T1(1.0f0):T1(1000.0f0) 74 | dt = T1(0.1f0) 75 | 76 | probs = map(1:numberOfParameters) do i 77 | prob_func(prob, i, false) 78 | end; 79 | 80 | ### Benchmarking 81 | using BenchmarkTools 82 | 83 | @info "Solving the problem: GPU" 84 | 85 | ## Move the arrays to the GPU 86 | gpuprobs = cu(probs); 87 | 88 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($gpuprobs, $prob, GPUEM(); 89 | save_everystep = false, 90 | dt = 0.1f0) 91 | 92 | if !isinteractive() 93 | open(joinpath(dirname(dirname(@__DIR__)), "data", "SDE", "CRN", 94 | "Julia_times_unadaptive.txt"), "a+") do io 95 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 96 | end 97 | end 98 | println("Parameter number: " * string(numberOfParameters)) 99 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 100 | println("Allocs: " * string(data.allocs)) 101 | 102 | @info "Solving the problem: CPU" 103 | 104 | data = @benchmark solve($eprob, EM(), EnsembleThreads(), dt = 0.1f0, adaptive = false, 105 | save_everystep = false, trajectories = 
using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq

@show ARGS
#settings

# Ensemble size: ARGS[1] when run as a script, a small default interactively.
numberOfParameters = isinteractive() ? 8192 : parse(Int64, ARGS[1])

# Lorenz system with σ = 10 and β ≈ 8/3 hard-coded (Float32); ρ = p[1] is the
# scanned parameter.
function lorenz(u, p, t)
    du1 = 10.0f0 * (u[2] - u[1])
    du2 = p[1] * u[1] - u[2] - u[1] * u[3]
    du3 = u[1] * u[2] - 2.666f0 * u[3]
    return @SVector [du1, du2, du3]
end

u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
tspan = (0.0f0, 1.0f0)
p = @SArray [21.0f0]
prob = ODEProblem(lorenz, u0, tspan, p)

parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters)

lorenzProblem = ODEProblem(lorenz, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]])

ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func)

## Building problems here only
I = 1:numberOfParameters
if ensembleProb.safetycopy
    probs = map(I) do i
        ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1)
    end
else
    probs = map(I) do i
        ensembleProb.prob_func(ensembleProb.prob, i, 1)
    end
end

## Make them compatible with Backend; ARGS[2] selects the vendor stack.
probs = if ARGS[2] == "CUDA"
    using CUDA
    cu(probs)
elseif ARGS[2] == "oneAPI"
    using oneAPI
    probs |> oneArray
elseif ARGS[2] == "AMDGPU"
    using AMDGPU
    roc(probs)
elseif ARGS[2] == "Metal"
    using Metal
    probs |> MtlArray
end

@info "Solving the problem"
# FIX(review): the AMDGPU branches previously had no device synchronization
# while the CUDA/oneAPI/Metal branches used `@sync`, so AMD timings could
# measure kernel launch rather than completion. `AMDGPU.@sync` added to both
# AMDGPU benchmarks for a fair comparison.
data = if ARGS[2] == "CUDA"
    @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "oneAPI"
    @benchmark oneAPI.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "AMDGPU"
    @benchmark AMDGPU.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "Metal"
    @benchmark Metal.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
end

if !isinteractive()
    open(joinpath(dirname(@__DIR__), "data", "devices", ARGS[2],
            "Julia_times_unadaptive.txt"), "a+") do io
        println(io, numberOfParameters, " ", minimum(data.times) / 1e6)
    end
end

println("Parameter number: " * string(numberOfParameters))
println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms")
println("Allocs: " * string(data.allocs))

data = if ARGS[2] == "CUDA"
    @benchmark CUDA.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "oneAPI"
    @benchmark oneAPI.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "AMDGPU"
    @benchmark AMDGPU.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "Metal"
    @benchmark Metal.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
end

if !isinteractive()
    open(joinpath(dirname(@__DIR__), "data", "devices", ARGS[2],
            "Julia_times_adaptive.txt"), "a+") do io
        println(io, numberOfParameters, " ", minimum(data.times) / 1e6)
    end
end

println("Parameter number: " * string(numberOfParameters))
println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms")
println("Allocs: " * string(data.allocs))
5; 40 | int SelectedDevice = SelectDeviceByClosestRevision(MajorRevision, MinorRevision); 41 | 42 | PrintPropertiesOfSpecificDevice(SelectedDevice); 43 | 44 | 45 | int NumberOfParameters_R = NumberOfProblems; 46 | PRECISION R_RangeLower = 0.0; 47 | PRECISION R_RangeUpper = 21.0; 48 | vector Parameters_R_Values(NumberOfParameters_R,0); 49 | Linspace(Parameters_R_Values, R_RangeLower, R_RangeUpper, NumberOfParameters_R); 50 | 51 | 52 | ProblemSolver ScanLorenz(SelectedDevice); 53 | 54 | ScanLorenz.SolverOption(ThreadsPerBlock, BlockSize); 55 | ScanLorenz.SolverOption(InitialTimeStep, 1.0e-3); 56 | 57 | 58 | clock_t SimulationStart; 59 | clock_t SimulationEnd; 60 | 61 | FillSolverObject(ScanLorenz, Parameters_R_Values, NT); 62 | 63 | ScanLorenz.SynchroniseFromHostToDevice(All); 64 | 65 | SimulationStart = clock(); 66 | ScanLorenz.Solve(); 67 | ScanLorenz.InsertSynchronisationPoint(); 68 | ScanLorenz.SynchroniseSolver(); 69 | SimulationEnd = clock(); 70 | 71 | ScanLorenz.SynchroniseFromDeviceToHost(All); 72 | 73 | cout << "Total simulation time: " << 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC << "ms" << endl; 74 | cout << "Simulation time / 1000 RK4 step: " << 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC << "ms" << endl; 75 | cout << "Ensemble size: " << NT << endl << endl; 76 | 77 | 78 | ofstream datafile; 79 | if (SOLVER == RK4){ 80 | datafile.open ("./data/cpp/MPGOS_times_unadaptive.txt",ios::app); 81 | datafile << NT << "\t"<< 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC <<"\n"; 82 | datafile.close(); 83 | }else{ 84 | 85 | datafile.open ("./data/cpp/MPGOS_times_adaptive.txt",ios::app); 86 | datafile << NT << "\t"<< 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC <<"\n"; 87 | datafile.close(); 88 | } 89 | 90 | //SaveData(ScanLorenz, NT); 91 | 92 | cout << "Test finished!" 
<< endl; 93 | } 94 | 95 | // AUXILIARY FUNCTION ----------------------------------------------------------------------------- 96 | 97 | void Linspace(vector& x, PRECISION B, PRECISION E, int N) 98 | { 99 | PRECISION Increment; 100 | 101 | x[0] = B; 102 | 103 | if ( N>1 ) 104 | { 105 | x[N-1] = E; 106 | Increment = (E-B)/(N-1); 107 | 108 | for (int i=1; i& Solver, const vector& R_Values, int NumberOfThreads) 116 | { 117 | int ProblemNumber = 0; 118 | for (int k=0; k& Solver, int NumberOfThreads) 134 | { 135 | ofstream DataFile; 136 | DataFile.open ( "Lorenz.txt" ); 137 | 138 | int Width = 18; 139 | DataFile.precision(10); 140 | DataFile.flags(ios::scientific); 141 | 142 | for (int tid=0; tid(tid, ControlParameters, 0) << ','; 145 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 0) << ','; 146 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 1) << ','; 147 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 2); 148 | DataFile << '\n'; 149 | } 150 | 151 | DataFile.close(); 152 | } 153 | -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/environment.yml: -------------------------------------------------------------------------------- 1 | name: venv_torch 2 | channels: 3 | - nvidia/label/cuda-11.6.0 4 | - conda-forge 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_kmp_llvm 10 | - appdirs=1.4.4=pyh9f0ad1d_0 11 | - asttokens=2.2.1=pyhd8ed1ab_0 12 | - backcall=0.2.0=pyh9f0ad1d_0 13 | - backports=1.0=pyhd8ed1ab_3 14 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 15 | - blas=1.0=mkl 16 | - boost=1.78.0=py310hc4a4660_4 17 | - boost-cpp=1.78.0=h75c5d50_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - ca-certificates=2022.12.7=ha878542_0 20 | - certifi=2022.12.7=pyhd8ed1ab_0 21 | - comm=0.1.2=pyhd8ed1ab_0 22 | - cuda-cccl=11.6.55=hf6102b2_0 23 | - cuda-command-line-tools=11.6.0=0 24 | - cuda-compiler=11.6.0=0 25 | - 
cuda-cudart=11.6.55=he381448_0 26 | - cuda-cudart-dev=11.6.55=h42ad0f4_0 27 | - cuda-cuobjdump=11.6.55=h9dd2d0c_0 28 | - cuda-cupti=11.6.55=h43c5c43_0 29 | - cuda-cuxxfilt=11.6.55=h69de05d_0 30 | - cuda-driver-dev=11.6.55=0 31 | - cuda-gdb=11.6.55=hff0b7d4_0 32 | - cuda-libraries=11.6.0=0 33 | - cuda-libraries-dev=11.6.0=0 34 | - cuda-memcheck=11.6.55=h0288dce_0 35 | - cuda-nsight=11.6.55=0 36 | - cuda-nsight-compute=11.6.0=0 37 | - cuda-nvcc=11.6.55=h5758ece_0 38 | - cuda-nvdisasm=11.6.55=h5556c0d_0 39 | - cuda-nvml-dev=11.6.55=haa9ef22_0 40 | - cuda-nvprof=11.6.55=h30b2dac_0 41 | - cuda-nvprune=11.6.55=h3791f62_0 42 | - cuda-nvrtc=11.6.55=hc54fff9_0 43 | - cuda-nvrtc-dev=11.6.55=h42ad0f4_0 44 | - cuda-nvtx=11.6.55=h99d0529_0 45 | - cuda-nvvp=11.6.58=h67ee751_0 46 | - cuda-samples=11.6.101=h8efea70_0 47 | - cuda-sanitizer-api=11.6.55=h4716e2e_0 48 | - cuda-toolkit=11.6.0=0 49 | - cuda-tools=11.6.0=0 50 | - cuda-visual-tools=11.6.0=0 51 | - cudatoolkit=11.8.0=h37601d7_11 52 | - debugpy=1.6.6=py310heca2aa9_0 53 | - decorator=5.1.1=pyhd8ed1ab_0 54 | - executing=1.2.0=pyhd8ed1ab_0 55 | - gds-tools=1.2.0.100=0 56 | - icu=70.1=h27087fc_0 57 | - importlib-metadata=6.0.0=pyha770c72_0 58 | - importlib_metadata=6.0.0=hd8ed1ab_0 59 | - ipykernel=6.19.2=py310h2f386ee_0 60 | - ipython=8.10.0=pyh41d4057_0 61 | - jedi=0.18.2=pyhd8ed1ab_0 62 | - jupyter_client=8.0.3=pyhd8ed1ab_0 63 | - jupyter_core=5.2.0=py310hff52083_0 64 | - ld_impl_linux-64=2.40=h41732ed_0 65 | - libcublas=11.8.1.74=h1e58c10_0 66 | - libcublas-dev=11.8.1.74=h7a51e1f_0 67 | - libcufft=10.7.0.55=h563f203_0 68 | - libcufft-dev=10.7.0.55=h05eb8d0_0 69 | - libcufile=1.2.0.100=0 70 | - libcufile-dev=1.2.0.100=0 71 | - libcurand=10.2.9.55=h7c349da_0 72 | - libcurand-dev=10.2.9.55=hd2e71f0_0 73 | - libcusolver=11.3.2.55=hebb49eb_0 74 | - libcusparse=11.7.1.55=h9a152cf_0 75 | - libffi=3.4.2=h7f98852_5 76 | - libgcc-ng=12.2.0=h65d4601_19 77 | - libhwloc=2.9.0=hd6dc26d_0 78 | - libiconv=1.17=h166bdaf_0 79 | - 
libnpp=11.6.0.55=hdb0c674_0 80 | - libnpp-dev=11.6.0.55=h0163868_0 81 | - libnsl=2.0.0=h7f98852_0 82 | - libnvjpeg=11.6.0.55=h6f17e28_0 83 | - libnvjpeg-dev=11.6.0.55=h0163868_0 84 | - libsodium=1.0.18=h36c2ea0_1 85 | - libsqlite=3.40.0=h753d276_0 86 | - libstdcxx-ng=12.2.0=h46fd767_19 87 | - libuuid=2.32.1=h7f98852_1000 88 | - libxml2=2.10.3=h7463322_0 89 | - libzlib=1.2.13=h166bdaf_4 90 | - llvm-openmp=15.0.7=h0cdce71_0 91 | - mako=1.2.4=pyhd8ed1ab_0 92 | - markupsafe=2.1.2=py310h1fa729e_0 93 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 94 | - mkl=2021.4.0=h8d4b97c_729 95 | - mkl-service=2.4.0=py310ha2c4b55_0 96 | - mkl_fft=1.3.1=py310h2b4bcf5_1 97 | - mkl_random=1.2.2=py310h00e6091_0 98 | - ncurses=6.3=h27087fc_1 99 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 100 | - nsight-compute=2022.1.0.12=0 101 | - numpy=1.23.4=py310hd5efca6_0 102 | - numpy-base=1.23.4=py310h8e6c178_0 103 | - openssl=3.1.0=h0b41bf4_0 104 | - packaging=23.0=pyhd8ed1ab_0 105 | - parso=0.8.3=pyhd8ed1ab_0 106 | - pexpect=4.8.0=pyh1a96a4e_2 107 | - pickleshare=0.7.5=py_1003 108 | - pip=22.2.2=py310h06a4308_0 109 | - platformdirs=3.0.0=pyhd8ed1ab_0 110 | - prompt-toolkit=3.0.36=pyha770c72_0 111 | - psutil=5.9.4=py310h5764c6d_0 112 | - ptyprocess=0.7.0=pyhd3deb0d_0 113 | - pure_eval=0.2.2=pyhd8ed1ab_0 114 | - pycuda=2022.2.2=py310h8981878_0 115 | - pygments=2.14.0=pyhd8ed1ab_0 116 | - python=3.10.9=he550d4f_0_cpython 117 | - python-dateutil=2.8.2=pyhd8ed1ab_0 118 | - python_abi=3.10=3_cp310 119 | - pytools=2022.1.14=pyhd8ed1ab_0 120 | - pyzmq=25.0.0=py310h059b190_0 121 | - readline=8.1.2=h0f457ee_0 122 | - setuptools=67.4.0=pyhd8ed1ab_0 123 | - six=1.16.0=pyh6c4a22f_0 124 | - stack_data=0.6.2=pyhd8ed1ab_0 125 | - tbb=2021.8.0=hf52228f_0 126 | - tk=8.6.12=h27826a3_0 127 | - tornado=6.2=py310h5764c6d_1 128 | - traitlets=5.9.0=pyhd8ed1ab_0 129 | - typing-extensions=4.4.0=hd8ed1ab_0 130 | - typing_extensions=4.4.0=pyha770c72_0 131 | - tzdata=2022g=h191b570_0 132 | - wcwidth=0.2.6=pyhd8ed1ab_0 133 | - 
wheel=0.38.4=pyhd8ed1ab_0 134 | - xz=5.2.6=h166bdaf_0 135 | - zeromq=4.3.4=h9c3ff4c_1 136 | - zipp=3.15.0=pyhd8ed1ab_0 137 | - zstd=1.5.2=h3eb15da_6 138 | - pip: 139 | - charset-normalizer==3.0.1 140 | - cmake==3.25.0 141 | - filelock==3.9.0 142 | - functorch==1.13.0 143 | - idna==3.4 144 | - mpmath==1.2.1 145 | - networkx==3.0rc1 146 | - nvidia-cublas-cu11==11.10.3.66 147 | - nvidia-cuda-nvrtc-cu11==11.7.99 148 | - nvidia-cuda-runtime-cu11==11.7.99 149 | - nvidia-cudnn-cu11==8.5.0.96 150 | - pillow==9.4.0 151 | - pytorch-triton==2.0.0+0d7e753227 152 | - requests==2.28.2 153 | - scipy==1.10.1 154 | - sympy==1.11.1 155 | - torch==2.0.0.dev20230202+cu116 156 | - torchaudio==2.0.0.dev20230201+cu116 157 | - torchdiffeq==0.2.3 158 | - torchode==0.1.1.post1 159 | - torchtyping==0.1.4 160 | - torchvision==0.15.0.dev20230201+cu116 161 | - typeguard==2.13.3 162 | - urllib3==1.26.14 163 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/pollu.jl: -------------------------------------------------------------------------------- 1 | using GPU_ODE_Julia 2 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq 3 | using CUDA 4 | 5 | @show ARGS 6 | #settings 7 | 8 | numberOfParameters = isinteractive() ? 
# In-place right-hand side of the 20-species air-pollution ("POLLU") stiff
# IVP. `y` holds the species concentrations, `dy` receives the derivatives.
# The rate constants k1..k25 are module-level `const`s defined above; `p` and
# `t` are unused (the system is autonomous).
function f(dy::AbstractArray{T}, y::AbstractArray{T}, p, t) where {T}
    # Mass-action reaction rates r1..r25.
    r1 = k1 * y[1]
    r2 = k2 * y[2] * y[4]
    r3 = k3 * y[5] * y[2]
    r4 = k4 * y[7]
    r5 = k5 * y[7]
    r6 = k6 * y[7] * y[6]
    r7 = k7 * y[9]
    r8 = k8 * y[9] * y[6]
    r9 = k9 * y[11] * y[2]
    r10 = k10 * y[11] * y[1]
    r11 = k11 * y[13]
    r12 = k12 * y[10] * y[2]
    r13 = k13 * y[14]
    r14 = k14 * y[1] * y[6]
    r15 = k15 * y[3]
    r16 = k16 * y[4]
    r17 = k17 * y[4]
    r18 = k18 * y[16]
    r19 = k19 * y[16]
    r20 = k20 * y[17] * y[6]
    r21 = k21 * y[19]
    r22 = k22 * y[19]
    r23 = k23 * y[1] * y[4]
    r24 = k24 * y[19] * y[1]
    r25 = k25 * y[20]

    # Net production of each species: consumed rates negative, produced
    # rates positive (r4/r18 appear twice where stoichiometry is 2).
    dy[1] = -r1 - r10 - r14 - r23 - r24 +
            r2 + r3 + r9 + r11 + r12 + r22 + r25
    dy[2] = -r2 - r3 - r9 - r12 + r1 + r21
    dy[3] = -r15 + r1 + r17 + r19 + r22
    dy[4] = -r2 - r16 - r17 - r23 + r15
    dy[5] = -r3 + r4 + r4 + r6 + r7 + r13 + r20
    dy[6] = -r6 - r8 - r14 - r20 + r3 + r18 + r18
    dy[7] = -r4 - r5 - r6 + r13
    dy[8] = r4 + r5 + r6 + r7
    dy[9] = -r7 - r8
    dy[10] = -r12 + r7 + r9
    dy[11] = -r9 - r10 + r8 + r11
    dy[12] = r9
    dy[13] = -r11 + r10
    dy[14] = -r13 + r12
    dy[15] = r14
    dy[16] = -r18 - r19 + r16
    dy[17] = -r20
    dy[18] = r20
    dy[19] = -r21 - r22 - r24 + r23 + r25
    dy[20] = -r25 + r24
end
y[9] 159 | J[11, 13] = k11 160 | 161 | J[12, 11] = k9 * y[2] 162 | J[12, 2] = k9 * y[11] 163 | 164 | J[13, 13] = -k11 165 | J[13, 11] = k10 * y[1] 166 | J[13, 1] = k10 * y[11] 167 | 168 | J[14, 14] = -k13 169 | J[14, 10] = k12 * y[2] 170 | J[14, 2] = k12 * y[10] 171 | 172 | J[15, 1] = k14 * y[6] 173 | J[15, 6] = k14 * y[1] 174 | 175 | J[16, 16] = -k18 - k19 176 | J[16, 4] = k16 177 | 178 | J[17, 17] = -k20 * y[6] 179 | J[17, 6] = -k20 * y[17] 180 | 181 | J[18, 17] = k20 * y[6] 182 | J[18, 6] = k20 * y[17] 183 | 184 | J[19, 19] = -k21 - k22 - k24 * y[1] 185 | J[19, 1] = -k24 * y[19] + k23 * y[4] 186 | J[19, 4] = k23 * y[1] 187 | J[19, 20] = k25 188 | 189 | J[20, 20] = -k25 190 | J[20, 1] = k24 * y[19] 191 | J[20, 19] = k24 * y[1] 192 | 193 | return 194 | end 195 | 196 | u0 = zeros(20) 197 | u0[2] = 0.2 198 | u0[4] = 0.04 199 | u0[7] = 0.1 200 | u0[8] = 0.3 201 | u0[9] = 0.01 202 | u0[17] = 0.007 203 | oprob = ODEProblem(f, u0, (T(0.0), T(60.0))) 204 | 205 | prob = make_gpu_compatible(oprob, Val(T)) 206 | 207 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 208 | 209 | ensembleProb = EnsembleProblem(prob) 210 | 211 | sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 212 | 213 | ### Lower level API #### 214 | 215 | ## Building problems here only 216 | I = 1:numberOfParameters 217 | if ensembleProb.safetycopy 218 | probs = map(I) do i 219 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 220 | end 221 | else 222 | probs = map(I) do i 223 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 224 | end 225 | end 226 | 227 | ## Make them compatible with CUDA 228 | probs = cu(probs) 229 | 230 | @info "Solving the problem" 231 | sol = @time CUDA.@sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 232 | save_everystep = false, dt = 0.001f0) 233 | 234 | # sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 235 | 
-------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_STEPPERS_H 2 | #define SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_STEPPERS_H 3 | 4 | 5 | // RK4 ------------------------------------------------------------------------ 6 | template 7 | __forceinline__ __device__ void PerThread_Stepper_RK4(\ 8 | int tid, \ 9 | Precision r_ActualTime, \ 10 | Precision r_TimeStep, \ 11 | Precision* r_ActualState, \ 12 | Precision* r_NextState, \ 13 | Precision* r_Error, \ 14 | int& r_IsFinite, \ 15 | Precision* r_ControlParameters, \ 16 | Precision* gs_SharedParameters, \ 17 | int* gs_IntegerSharedParameters, \ 18 | Precision* r_Accessories, \ 19 | int* r_IntegerAccessories) 20 | { 21 | // MEMORY MANAGEMENT ------------------------------------------------------ 22 | Precision X[SD]; 23 | Precision k1[SD]; 24 | 25 | Precision T; 26 | Precision dTp2 = static_cast(0.5) * r_TimeStep; 27 | Precision dTp6 = static_cast(1.0/6.0) * r_TimeStep; 28 | 29 | 30 | // K1 --------------------------------------------------------------------- 31 | PerThread_OdeFunction(\ 32 | tid, \ 33 | NT, \ 34 | r_NextState, \ 35 | r_ActualState, \ 36 | r_ActualTime, \ 37 | r_ControlParameters, \ 38 | gs_SharedParameters, \ 39 | gs_IntegerSharedParameters, \ 40 | r_Accessories, \ 41 | r_IntegerAccessories); 42 | 43 | 44 | // K2 --------------------------------------------------------------------- 45 | //printf("time: %f\n",r_ActualTime); 46 | T = r_ActualTime + dTp2; 47 | 48 | #pragma unroll 49 | for (int i=0; i(2.0)*k1[i]; 70 | X[i] = r_ActualState[i] + k1[i] * dTp2; 71 | } 72 | 73 | PerThread_OdeFunction(\ 74 | tid, \ 75 | NT, \ 76 | k1, \ 77 | X, \ 78 | T, \ 79 | r_ControlParameters, \ 80 | gs_SharedParameters, \ 81 | gs_IntegerSharedParameters, \ 82 | r_Accessories, \ 
83 | r_IntegerAccessories); 84 | 85 | 86 | // K4 --------------------------------------------------------------------- 87 | T = r_ActualTime + r_TimeStep; 88 | 89 | #pragma unroll 90 | for (int i=0; i(2.0)*k1[i]; 93 | X[i] = r_ActualState[i] + k1[i] * r_TimeStep; 94 | } 95 | 96 | PerThread_OdeFunction(\ 97 | tid, \ 98 | NT, \ 99 | k1, \ 100 | X, \ 101 | T, \ 102 | r_ControlParameters, \ 103 | gs_SharedParameters, \ 104 | gs_IntegerSharedParameters, \ 105 | r_Accessories, \ 106 | r_IntegerAccessories); 107 | 108 | 109 | // NEW STATE -------------------------------------------------------------- 110 | #pragma unroll 111 | for (int i=0; i 123 | __forceinline__ __device__ void PerThread_Stepper_RKCK45(\ 124 | int tid, \ 125 | Precision r_ActualTime, \ 126 | Precision r_TimeStep, \ 127 | Precision* r_ActualState, \ 128 | Precision* r_NextState, \ 129 | Precision* r_Error, \ 130 | int& r_IsFinite, \ 131 | Precision* r_ControlParameters, \ 132 | Precision* gs_SharedParameters, \ 133 | int* gs_IntegerSharedParameters, \ 134 | Precision* r_Accessories, \ 135 | int* r_IntegerAccessories) 136 | { 137 | // MEMORY MANAGEMENT ------------------------------------------------------ 138 | Precision X[SD]; 139 | Precision T; 140 | 141 | Precision k1[SD]; 142 | Precision k2[SD]; 143 | Precision k3[SD]; 144 | Precision k4[SD]; 145 | Precision k5[SD]; 146 | Precision k6[SD]; 147 | 148 | 149 | // K1 --------------------------------------------------------------------- 150 | PerThread_OdeFunction(\ 151 | tid, \ 152 | NT, \ 153 | k1, \ 154 | r_ActualState, \ 155 | r_ActualTime, \ 156 | r_ControlParameters, \ 157 | gs_SharedParameters, \ 158 | gs_IntegerSharedParameters, \ 159 | r_Accessories, \ 160 | r_IntegerAccessories); 161 | 162 | 163 | // K2 --------------------------------------------------------------------- 164 | //printf("time: %f\n",r_ActualTime); 165 | T = r_ActualTime + r_TimeStep * static_cast(1.0/5.0); 166 | 167 | #pragma unroll 168 | for (int i=0; i(1.0/5.0) * k1[i] ); 170 
| 171 | PerThread_OdeFunction(\ 172 | tid, \ 173 | NT, \ 174 | k2, \ 175 | X, \ 176 | T, \ 177 | r_ControlParameters, \ 178 | gs_SharedParameters, \ 179 | gs_IntegerSharedParameters, \ 180 | r_Accessories, \ 181 | r_IntegerAccessories); 182 | 183 | 184 | // K3 --------------------------------------------------------------------- 185 | T = r_ActualTime + r_TimeStep * static_cast(3.0/10.0); 186 | 187 | #pragma unroll 188 | for (int i=0; i(3.0/40.0) * k1[i] + \ 190 | static_cast(9.0/40.0) * k2[i] ); 191 | 192 | PerThread_OdeFunction(tid, \ 193 | NT, \ 194 | k3, \ 195 | X, \ 196 | T, \ 197 | r_ControlParameters, \ 198 | gs_SharedParameters, \ 199 | gs_IntegerSharedParameters, \ 200 | r_Accessories, \ 201 | r_IntegerAccessories); 202 | 203 | 204 | // K4 --------------------------------------------------------------------- 205 | T = r_ActualTime + r_TimeStep * static_cast(3.0/5.0); 206 | 207 | #pragma unroll 208 | for (int i=0; i(3.0/10.0) * k1[i] + \ 210 | static_cast(-9.0/10.0) * k2[i] + \ 211 | static_cast(6.0/5.0) * k3[i] ); 212 | 213 | PerThread_OdeFunction(\ 214 | tid, \ 215 | NT, \ 216 | k4, \ 217 | X, \ 218 | T, \ 219 | r_ControlParameters, \ 220 | gs_SharedParameters, \ 221 | gs_IntegerSharedParameters, \ 222 | r_Accessories, \ 223 | r_IntegerAccessories); 224 | 225 | 226 | // K5 --------------------------------------------------------------------- 227 | T = r_ActualTime + r_TimeStep; 228 | 229 | #pragma unroll 230 | for (int i=0; i(-11.0/54.0) * k1[i] + \ 232 | static_cast(5.0/2.0) * k2[i] + \ 233 | static_cast(-70.0/27.0) * k3[i] + \ 234 | static_cast(35.0/27.0) * k4[i] ); 235 | 236 | PerThread_OdeFunction(\ 237 | tid, \ 238 | NT, \ 239 | k5, \ 240 | X, \ 241 | T, \ 242 | r_ControlParameters, \ 243 | gs_SharedParameters, \ 244 | gs_IntegerSharedParameters, \ 245 | r_Accessories, \ 246 | r_IntegerAccessories); 247 | 248 | 249 | // K6 --------------------------------------------------------------------- 250 | T = r_ActualTime + r_TimeStep * static_cast(7.0/8.0); 
251 | 252 | #pragma unroll 253 | for (int i=0; i(1631.0/55296.0) * k1[i] + \ 255 | static_cast(175.0/512.0) * k2[i] + \ 256 | static_cast(575.0/13824.0) * k3[i] + \ 257 | static_cast(44275.0/110592.0) * k4[i] + \ 258 | static_cast(253.0/4096.0) * k5[i] ); 259 | 260 | PerThread_OdeFunction(\ 261 | tid, \ 262 | NT, \ 263 | k6, \ 264 | X, \ 265 | T, \ 266 | r_ControlParameters, \ 267 | gs_SharedParameters, \ 268 | gs_IntegerSharedParameters, \ 269 | r_Accessories, \ 270 | r_IntegerAccessories); 271 | 272 | 273 | // NEW STATE AND ERROR ---------------------------------------------------- 274 | #pragma unroll 275 | for (int i=0; i(37.0/378.0) * k1[i] + \ 278 | static_cast(250.0/621.0) * k3[i] + \ 279 | static_cast(125.0/594.0) * k4[i] + \ 280 | static_cast(512.0/1771.0) * k6[i] ); 281 | 282 | r_Error[i] = static_cast( 37.0/378.0 - 2825.0/27648.0 ) * k1[i] + \ 283 | static_cast( 250.0/621.0 - 18575.0/48384.0 ) * k3[i] + \ 284 | static_cast( 125.0/594.0 - 13525.0/55296.0 ) * k4[i] + \ 285 | static_cast( 0.0 - 277.0/14336.0 ) * k5[i] + \ 286 | static_cast( 512.0/1771.0 - 1.0/4.0 ) * k6[i]; 287 | r_Error[i] = r_TimeStep * abs( r_Error[i] ) + 1e-18; 288 | 289 | if ( ( isfinite( r_NextState[i] ) == 0 ) || ( isfinite( r_Error[i] ) == 0 ) ) 290 | r_IsFinite = 0; 291 | } 292 | } 293 | 294 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/CoupledSystems_PerBlock_EventHandling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef COUPLEDSYSTEMS_PERBLOCK_EVENTHANDLING_H 2 | #define COUPLEDSYSTEMS_PERBLOCK_EVENTHANDLING_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_EventTimeStepControl(\ 7 | Precision* s_TimeStep, \ 8 | Precision* s_NewTimeStep, \ 9 | int* s_TerminateSystemScope, \ 10 | int* s_UpdateStep, \ 11 | Precision r_ActualEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 12 | Precision 
r_NextEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 13 | Precision* s_EventTolerance, \ 14 | int* s_EventDirection, \ 15 | Precision MinimumTimeStep) 16 | { 17 | int LocalThreadID_GPU = threadIdx.x; 18 | int BlockID = blockIdx.x; 19 | int LocalThreadID_Logical; 20 | int LocalSystemID; 21 | int GlobalSystemID; 22 | 23 | __shared__ Precision s_EventTimeStep[SPB]; 24 | __shared__ int s_IsCorrected[SPB]; 25 | 26 | // Event time step initialisation 27 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 28 | for (int j=0; j s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 51 | ( ( r_ActualEventValue[BL][i] < -s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 52 | { 53 | MPGOS::atomicFMIN(&(s_EventTimeStep[LocalSystemID]), -r_ActualEventValue[BL][i] / (r_NextEventValue[BL][i]-r_ActualEventValue[BL][i]) * s_TimeStep[LocalSystemID]); 54 | atomicMax(&(s_IsCorrected[LocalSystemID]), 1); 55 | } 56 | } 57 | } 58 | } 59 | __syncthreads(); 60 | 61 | // Corrected time step and modified update 62 | for (int j=0; j 84 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_EventTimeStepControl(\ 85 | Precision s_TimeStep, \ 86 | Precision& s_NewTimeStep, \ 87 | int s_TerminateSystemScope, \ 88 | int& s_UpdateStep, \ 89 | Precision r_ActualEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 90 | Precision r_NextEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 91 | Precision* s_EventTolerance, \ 92 | int* s_EventDirection, \ 93 | Precision MinimumTimeStep) 94 | { 95 | const int LocalThreadID_GPU = threadIdx.x; 96 | const int GlobalSystemID = blockIdx.x; 97 | 98 | int UnitID; 99 | 100 | 101 | __shared__ Precision s_EventTimeStep; 102 | __shared__ int s_IsCorrected; 103 | 104 | // Event time step initialisation 105 | if ( threadIdx.x == 0 ) 106 | { 107 | s_EventTimeStep = s_TimeStep; 108 | s_IsCorrected = 0; 109 | } 110 | 
__syncthreads(); 111 | 112 | // Event time step correction 113 | for (int BL=0; BL s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 122 | ( ( r_ActualEventValue[BL][i] < -s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 123 | { 124 | MPGOS::atomicFMIN(&s_EventTimeStep, -r_ActualEventValue[BL][i] / (r_NextEventValue[BL][i]-r_ActualEventValue[BL][i]) * s_TimeStep); 125 | atomicMax(&s_IsCorrected, 1); 126 | } 127 | } 128 | } 129 | } 130 | __syncthreads(); 131 | 132 | // Corrected time step and modified update 133 | if ( ( threadIdx.x == 0 ) && ( s_IsCorrected == 1 ) ) 134 | { 135 | if ( s_EventTimeStep < MinimumTimeStep ) 136 | { 137 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (global system id: %d)\n", GlobalSystemID); 138 | } else 139 | { 140 | s_NewTimeStep = s_EventTimeStep; 141 | s_UpdateStep = 0; 142 | } 143 | } 144 | __syncthreads(); 145 | } 146 | 147 | 148 | template 149 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_EventTimeStepControl(\ 150 | const int LocalSystemID, \ 151 | Precision* s_TimeStep, \ 152 | Precision* s_NewTimeStep, \ 153 | int* s_TerminateSystemScope, \ 154 | int* s_UpdateStep, \ 155 | Precision r_ActualEventValue[(NE==0?1:NE)], \ 156 | Precision r_NextEventValue[(NE==0?1:NE)], \ 157 | Precision* s_EventTolerance, \ 158 | int* s_EventDirection, \ 159 | Precision MinimumTimeStep) 160 | { 161 | __shared__ Precision s_EventTimeStep[SPB]; 162 | __shared__ int s_IsCorrected[SPB]; 163 | 164 | // Event time step initialisation 165 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 
0 : 1); 166 | for (int j=0; j s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 184 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 185 | { 186 | MPGOS::atomicFMIN(&(s_EventTimeStep[LocalSystemID]), -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * s_TimeStep[LocalSystemID]); 187 | atomicMax(&(s_IsCorrected[LocalSystemID]), 1); 188 | } 189 | } 190 | } 191 | __syncthreads(); 192 | 193 | // Corrected time step and modified update 194 | for (int j=0; j 216 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_EventTimeStepControl(\ 217 | const int GlobalSystemID, \ 218 | const int LocalThreadID, \ 219 | Precision s_TimeStep, \ 220 | Precision& s_NewTimeStep, \ 221 | int s_TerminateSystemScope, \ 222 | int& s_UpdateStep, \ 223 | Precision r_ActualEventValue[(NE==0?1:NE)], \ 224 | Precision r_NextEventValue[(NE==0?1:NE)], \ 225 | Precision* s_EventTolerance, \ 226 | int* s_EventDirection, \ 227 | Precision MinimumTimeStep) 228 | { 229 | __shared__ Precision s_EventTimeStep; 230 | __shared__ int s_IsCorrected; 231 | 232 | // Event time step initialisation 233 | if ( threadIdx.x == 0 ) 234 | { 235 | s_EventTimeStep = s_TimeStep; 236 | s_IsCorrected = 0; 237 | } 238 | __syncthreads(); 239 | 240 | // Event time step correction 241 | if ( ( LocalThreadID < UPS ) && ( s_UpdateStep == 1 ) && ( s_TerminateSystemScope == 0 ) ) 242 | { 243 | for (int i=0; i s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 246 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 247 | { 248 | MPGOS::atomicFMIN(&s_EventTimeStep, -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * s_TimeStep); 249 | atomicMax(&s_IsCorrected, 1); 250 
| } 251 | } 252 | } 253 | __syncthreads(); 254 | 255 | // Corrected time step and modified update 256 | if ( ( threadIdx.x == 0 ) && ( s_IsCorrected == 1 ) ) 257 | { 258 | if ( s_EventTimeStep < MinimumTimeStep ) 259 | { 260 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (global system id: %d)\n", GlobalSystemID); 261 | } else 262 | { 263 | s_NewTimeStep = s_EventTimeStep; 264 | s_UpdateStep = 0; 265 | } 266 | } 267 | __syncthreads(); 268 | } 269 | 270 | #endif -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/catalyst_models/multisite2.net: -------------------------------------------------------------------------------- 1 | # Created by BioNetGen 2.7.0 2 | begin parameters 3 | 1 Rtot 5360 # Constant 4 | 2 Ltot 1160 # Constant 5 | 3 Atot 5360 # Constant 6 | 4 kOnR 0.01 # Constant 7 | 5 kOffR 0.1 # Constant 8 | 6 kCatR 5.0 # Constant 9 | 7 kOnL 0.01 # Constant 10 | 8 kOffL 0.1 # Constant 11 | 9 kCatL 5.1 # Constant 12 | end parameters 13 | begin species 14 | 1 R(a) Rtot 15 | 2 L(a) Ltot 16 | 3 A(p1~U,p2~U,p3~U) Atot 17 | 4 A(p1~U!1,p2~U,p3~U).R(a!1) 0 18 | 5 A(p1~U,p2~U!1,p3~U).R(a!1) 0 19 | 6 A(p1~U,p2~U,p3~U!1).R(a!1) 0 20 | 7 A(p1~U!1,p2~U!2,p3~U).R(a!1).R(a!2) 0 21 | 8 A(p1~U!1,p2~U,p3~U!2).R(a!1).R(a!2) 0 22 | 9 A(p1~P,p2~U,p3~U) 0 23 | 10 A(p1~U,p2~U!1,p3~U!2).R(a!1).R(a!2) 0 24 | 11 A(p1~U,p2~P,p3~U) 0 25 | 12 A(p1~U,p2~U,p3~P) 0 26 | 13 A(p1~U!1,p2~U!2,p3~U!3).R(a!1).R(a!2).R(a!3) 0 27 | 14 A(p1~U!1,p2~P,p3~U).R(a!1) 0 28 | 15 A(p1~U!1,p2~U,p3~P).R(a!1) 0 29 | 16 A(p1~P,p2~U!1,p3~U).R(a!1) 0 30 | 17 A(p1~P,p2~U,p3~U!1).R(a!1) 0 31 | 18 A(p1~P!1,p2~U,p3~U).L(a!1) 0 32 | 19 A(p1~U,p2~U!1,p3~P).R(a!1) 0 33 | 20 A(p1~U,p2~P,p3~U!1).R(a!1) 0 34 | 21 A(p1~U,p2~P!1,p3~U).L(a!1) 0 35 | 22 A(p1~U,p2~U,p3~P!1).L(a!1) 0 36 | 23 A(p1~U!1,p2~U!2,p3~P).R(a!1).R(a!2) 0 37 | 24 A(p1~U!1,p2~P,p3~U!2).R(a!1).R(a!2) 0 38 | 25 
A(p1~U!1,p2~P!2,p3~U).L(a!2).R(a!1) 0 39 | 26 A(p1~U!1,p2~U,p3~P!2).L(a!2).R(a!1) 0 40 | 27 A(p1~P,p2~U!1,p3~U!2).R(a!1).R(a!2) 0 41 | 28 A(p1~P,p2~P,p3~U) 0 42 | 29 A(p1~P,p2~U,p3~P) 0 43 | 30 A(p1~P!1,p2~U!2,p3~U).L(a!1).R(a!2) 0 44 | 31 A(p1~P!1,p2~U,p3~U!2).L(a!1).R(a!2) 0 45 | 32 A(p1~U,p2~U!1,p3~P!2).L(a!2).R(a!1) 0 46 | 33 A(p1~U,p2~P,p3~P) 0 47 | 34 A(p1~U,p2~P!1,p3~U!2).L(a!1).R(a!2) 0 48 | 35 A(p1~U!1,p2~U!2,p3~P!3).L(a!3).R(a!1).R(a!2) 0 49 | 36 A(p1~U!1,p2~P,p3~P).R(a!1) 0 50 | 37 A(p1~U!1,p2~P!2,p3~U!3).L(a!2).R(a!1).R(a!3) 0 51 | 38 A(p1~P,p2~U!1,p3~P).R(a!1) 0 52 | 39 A(p1~P,p2~P,p3~U!1).R(a!1) 0 53 | 40 A(p1~P,p2~P!1,p3~U).L(a!1) 0 54 | 41 A(p1~P,p2~U,p3~P!1).L(a!1) 0 55 | 42 A(p1~P!1,p2~U!2,p3~U!3).L(a!1).R(a!2).R(a!3) 0 56 | 43 A(p1~P!1,p2~P,p3~U).L(a!1) 0 57 | 44 A(p1~P!1,p2~U,p3~P).L(a!1) 0 58 | 45 A(p1~U,p2~P,p3~P!1).L(a!1) 0 59 | 46 A(p1~U,p2~P!1,p3~P).L(a!1) 0 60 | 47 A(p1~U!1,p2~P,p3~P!2).L(a!2).R(a!1) 0 61 | 48 A(p1~U!1,p2~P!2,p3~P).L(a!2).R(a!1) 0 62 | 49 A(p1~P,p2~U!1,p3~P!2).L(a!2).R(a!1) 0 63 | 50 A(p1~P,p2~P,p3~P) 0 64 | 51 A(p1~P,p2~P!1,p3~U!2).L(a!1).R(a!2) 0 65 | 52 A(p1~P!1,p2~U!2,p3~P).L(a!1).R(a!2) 0 66 | 53 A(p1~P!1,p2~P,p3~U!2).L(a!1).R(a!2) 0 67 | 54 A(p1~P!1,p2~P!2,p3~U).L(a!1).L(a!2) 0 68 | 55 A(p1~P!1,p2~U,p3~P!2).L(a!1).L(a!2) 0 69 | 56 A(p1~U,p2~P!1,p3~P!2).L(a!1).L(a!2) 0 70 | 57 A(p1~U!1,p2~P!2,p3~P!3).L(a!2).L(a!3).R(a!1) 0 71 | 58 A(p1~P,p2~P,p3~P!1).L(a!1) 0 72 | 59 A(p1~P,p2~P!1,p3~P).L(a!1) 0 73 | 60 A(p1~P!1,p2~U!2,p3~P!3).L(a!1).L(a!3).R(a!2) 0 74 | 61 A(p1~P!1,p2~P,p3~P).L(a!1) 0 75 | 62 A(p1~P!1,p2~P!2,p3~U!3).L(a!1).L(a!2).R(a!3) 0 76 | 63 A(p1~P,p2~P!1,p3~P!2).L(a!1).L(a!2) 0 77 | 64 A(p1~P!1,p2~P,p3~P!2).L(a!1).L(a!2) 0 78 | 65 A(p1~P!1,p2~P!2,p3~P).L(a!1).L(a!2) 0 79 | 66 A(p1~P!1,p2~P!2,p3~P!3).L(a!1).L(a!2).L(a!3) 0 80 | end species 81 | begin reactions 82 | 1 1,3 4 kOnR #_R1 83 | 2 1,3 5 kOnR #_R5 84 | 3 1,3 6 kOnR #_R9 85 | 4 1,5 7 kOnR #_R1 86 | 5 1,6 8 kOnR #_R1 87 | 6 4 1,3 kOffR #_reverse__R1 88 | 7 
4 1,9 kCatR #_R2 89 | 8 1,4 7 kOnR #_R5 90 | 9 1,6 10 kOnR #_R5 91 | 10 5 1,3 kOffR #_reverse__R5 92 | 11 5 1,11 kCatR #_R6 93 | 12 1,4 8 kOnR #_R9 94 | 13 1,5 10 kOnR #_R9 95 | 14 6 1,3 kOffR #_reverse__R9 96 | 15 6 1,12 kCatR #_R10 97 | 16 1,10 13 kOnR #_R1 98 | 17 1,11 14 kOnR #_R1 99 | 18 1,12 15 kOnR #_R1 100 | 19 7 1,5 kOffR #_reverse__R1 101 | 20 8 1,6 kOffR #_reverse__R1 102 | 21 7 1,16 kCatR #_R2 103 | 22 8 1,17 kCatR #_R2 104 | 23 2,9 18 kOnL #_R3 105 | 24 1,8 13 kOnR #_R5 106 | 25 1,9 16 kOnR #_R5 107 | 26 1,12 19 kOnR #_R5 108 | 27 7 1,4 kOffR #_reverse__R5 109 | 28 10 1,6 kOffR #_reverse__R5 110 | 29 7 1,14 kCatR #_R6 111 | 30 10 1,20 kCatR #_R6 112 | 31 2,11 21 kOnL #_R7 113 | 32 1,7 13 kOnR #_R9 114 | 33 1,9 17 kOnR #_R9 115 | 34 1,11 20 kOnR #_R9 116 | 35 8 1,4 kOffR #_reverse__R9 117 | 36 10 1,5 kOffR #_reverse__R9 118 | 37 8 1,15 kCatR #_R10 119 | 38 10 1,19 kCatR #_R10 120 | 39 2,12 22 kOnL #_R11 121 | 40 1,19 23 kOnR #_R1 122 | 41 1,20 24 kOnR #_R1 123 | 42 1,21 25 kOnR #_R1 124 | 43 1,22 26 kOnR #_R1 125 | 44 13 1,10 kOffR #_reverse__R1 126 | 45 14 1,11 kOffR #_reverse__R1 127 | 46 15 1,12 kOffR #_reverse__R1 128 | 47 13 1,27 kCatR #_R2 129 | 48 14 1,28 kCatR #_R2 130 | 49 15 1,29 kCatR #_R2 131 | 50 2,16 30 kOnL #_R3 132 | 51 2,17 31 kOnL #_R3 133 | 52 18 2,9 kOffL #_reverse__R3 134 | 53 18 2,3 kCatL #_R4 135 | 54 1,15 23 kOnR #_R5 136 | 55 1,17 27 kOnR #_R5 137 | 56 1,18 30 kOnR #_R5 138 | 57 1,22 32 kOnR #_R5 139 | 58 13 1,8 kOffR #_reverse__R5 140 | 59 16 1,9 kOffR #_reverse__R5 141 | 60 19 1,12 kOffR #_reverse__R5 142 | 61 13 1,24 kCatR #_R6 143 | 62 16 1,28 kCatR #_R6 144 | 63 19 1,33 kCatR #_R6 145 | 64 2,14 25 kOnL #_R7 146 | 65 2,20 34 kOnL #_R7 147 | 66 21 2,11 kOffL #_reverse__R7 148 | 67 21 2,3 kCatL #_R8 149 | 68 1,14 24 kOnR #_R9 150 | 69 1,16 27 kOnR #_R9 151 | 70 1,18 31 kOnR #_R9 152 | 71 1,21 34 kOnR #_R9 153 | 72 13 1,7 kOffR #_reverse__R9 154 | 73 17 1,9 kOffR #_reverse__R9 155 | 74 20 1,11 kOffR #_reverse__R9 156 | 75 13 
1,23 kCatR #_R10 157 | 76 17 1,29 kCatR #_R10 158 | 77 20 1,33 kCatR #_R10 159 | 78 2,15 26 kOnL #_R11 160 | 79 2,19 32 kOnL #_R11 161 | 80 22 2,12 kOffL #_reverse__R11 162 | 81 22 2,3 kCatL #_R12 163 | 82 1,32 35 kOnR #_R1 164 | 83 1,33 36 kOnR #_R1 165 | 84 1,34 37 kOnR #_R1 166 | 85 23 1,19 kOffR #_reverse__R1 167 | 86 24 1,20 kOffR #_reverse__R1 168 | 87 25 1,21 kOffR #_reverse__R1 169 | 88 26 1,22 kOffR #_reverse__R1 170 | 89 23 1,38 kCatR #_R2 171 | 90 24 1,39 kCatR #_R2 172 | 91 25 1,40 kCatR #_R2 173 | 92 26 1,41 kCatR #_R2 174 | 93 2,27 42 kOnL #_R3 175 | 94 2,28 43 kOnL #_R3 176 | 95 2,29 44 kOnL #_R3 177 | 96 30 2,16 kOffL #_reverse__R3 178 | 97 31 2,17 kOffL #_reverse__R3 179 | 98 30 2,5 kCatL #_R4 180 | 99 31 2,6 kCatL #_R4 181 | 100 1,26 35 kOnR #_R5 182 | 101 1,29 38 kOnR #_R5 183 | 102 1,31 42 kOnR #_R5 184 | 103 23 1,15 kOffR #_reverse__R5 185 | 104 27 1,17 kOffR #_reverse__R5 186 | 105 30 1,18 kOffR #_reverse__R5 187 | 106 32 1,22 kOffR #_reverse__R5 188 | 107 23 1,36 kCatR #_R6 189 | 108 27 1,39 kCatR #_R6 190 | 109 30 1,43 kCatR #_R6 191 | 110 32 1,45 kCatR #_R6 192 | 111 2,24 37 kOnL #_R7 193 | 112 2,28 40 kOnL #_R7 194 | 113 2,33 46 kOnL #_R7 195 | 114 25 2,14 kOffL #_reverse__R7 196 | 115 34 2,20 kOffL #_reverse__R7 197 | 116 25 2,4 kCatL #_R8 198 | 117 34 2,6 kCatL #_R8 199 | 118 1,25 37 kOnR #_R9 200 | 119 1,28 39 kOnR #_R9 201 | 120 1,30 42 kOnR #_R9 202 | 121 24 1,14 kOffR #_reverse__R9 203 | 122 27 1,16 kOffR #_reverse__R9 204 | 123 31 1,18 kOffR #_reverse__R9 205 | 124 34 1,21 kOffR #_reverse__R9 206 | 125 24 1,36 kCatR #_R10 207 | 126 27 1,38 kCatR #_R10 208 | 127 31 1,44 kCatR #_R10 209 | 128 34 1,46 kCatR #_R10 210 | 129 2,23 35 kOnL #_R11 211 | 130 2,29 41 kOnL #_R11 212 | 131 2,33 45 kOnL #_R11 213 | 132 26 2,15 kOffL #_reverse__R11 214 | 133 32 2,19 kOffL #_reverse__R11 215 | 134 26 2,4 kCatL #_R12 216 | 135 32 2,5 kCatL #_R12 217 | 136 1,45 47 kOnR #_R1 218 | 137 1,46 48 kOnR #_R1 219 | 138 35 1,32 kOffR #_reverse__R1 220 | 139 
36 1,33 kOffR #_reverse__R1 221 | 140 37 1,34 kOffR #_reverse__R1 222 | 141 35 1,49 kCatR #_R2 223 | 142 36 1,50 kCatR #_R2 224 | 143 37 1,51 kCatR #_R2 225 | 144 2,38 52 kOnL #_R3 226 | 145 2,39 53 kOnL #_R3 227 | 146 2,40 54 kOnL #_R3 228 | 147 2,41 55 kOnL #_R3 229 | 148 42 2,27 kOffL #_reverse__R3 230 | 149 43 2,28 kOffL #_reverse__R3 231 | 150 44 2,29 kOffL #_reverse__R3 232 | 151 42 2,10 kCatL #_R4 233 | 152 43 2,11 kCatL #_R4 234 | 153 44 2,12 kCatL #_R4 235 | 154 1,41 49 kOnR #_R5 236 | 155 1,44 52 kOnR #_R5 237 | 156 35 1,26 kOffR #_reverse__R5 238 | 157 38 1,29 kOffR #_reverse__R5 239 | 158 42 1,31 kOffR #_reverse__R5 240 | 159 35 1,47 kCatR #_R6 241 | 160 38 1,50 kCatR #_R6 242 | 161 42 1,53 kCatR #_R6 243 | 162 2,36 48 kOnL #_R7 244 | 163 2,39 51 kOnL #_R7 245 | 164 2,43 54 kOnL #_R7 246 | 165 2,45 56 kOnL #_R7 247 | 166 37 2,24 kOffL #_reverse__R7 248 | 167 40 2,28 kOffL #_reverse__R7 249 | 168 46 2,33 kOffL #_reverse__R7 250 | 169 37 2,8 kCatL #_R8 251 | 170 40 2,9 kCatL #_R8 252 | 171 46 2,12 kCatL #_R8 253 | 172 1,40 51 kOnR #_R9 254 | 173 1,43 53 kOnR #_R9 255 | 174 37 1,25 kOffR #_reverse__R9 256 | 175 39 1,28 kOffR #_reverse__R9 257 | 176 42 1,30 kOffR #_reverse__R9 258 | 177 37 1,48 kCatR #_R10 259 | 178 39 1,50 kCatR #_R10 260 | 179 42 1,52 kCatR #_R10 261 | 180 2,36 47 kOnL #_R11 262 | 181 2,38 49 kOnL #_R11 263 | 182 2,44 55 kOnL #_R11 264 | 183 2,46 56 kOnL #_R11 265 | 184 35 2,23 kOffL #_reverse__R11 266 | 185 41 2,29 kOffL #_reverse__R11 267 | 186 45 2,33 kOffL #_reverse__R11 268 | 187 35 2,7 kCatL #_R12 269 | 188 41 2,9 kCatL #_R12 270 | 189 45 2,11 kCatL #_R12 271 | 190 1,56 57 kOnR #_R1 272 | 191 47 1,45 kOffR #_reverse__R1 273 | 192 48 1,46 kOffR #_reverse__R1 274 | 193 47 1,58 kCatR #_R2 275 | 194 48 1,59 kCatR #_R2 276 | 195 2,49 60 kOnL #_R3 277 | 196 2,50 61 kOnL #_R3 278 | 197 2,51 62 kOnL #_R3 279 | 198 52 2,38 kOffL #_reverse__R3 280 | 199 53 2,39 kOffL #_reverse__R3 281 | 200 54 2,40 kOffL #_reverse__R3 282 | 201 55 2,41 kOffL 
#_reverse__R3 283 | 202 52 2,19 kCatL #_R4 284 | 203 53 2,20 kCatL #_R4 285 | 204 54 2,21 kCatL #_R4 286 | 205 55 2,22 kCatL #_R4 287 | 206 1,55 60 kOnR #_R5 288 | 207 49 1,41 kOffR #_reverse__R5 289 | 208 52 1,44 kOffR #_reverse__R5 290 | 209 49 1,58 kCatR #_R6 291 | 210 52 1,61 kCatR #_R6 292 | 211 2,47 57 kOnL #_R7 293 | 212 2,50 59 kOnL #_R7 294 | 213 2,53 62 kOnL #_R7 295 | 214 48 2,36 kOffL #_reverse__R7 296 | 215 51 2,39 kOffL #_reverse__R7 297 | 216 54 2,43 kOffL #_reverse__R7 298 | 217 56 2,45 kOffL #_reverse__R7 299 | 218 48 2,15 kCatL #_R8 300 | 219 51 2,17 kCatL #_R8 301 | 220 54 2,18 kCatL #_R8 302 | 221 56 2,22 kCatL #_R8 303 | 222 1,54 62 kOnR #_R9 304 | 223 51 1,40 kOffR #_reverse__R9 305 | 224 53 1,43 kOffR #_reverse__R9 306 | 225 51 1,59 kCatR #_R10 307 | 226 53 1,61 kCatR #_R10 308 | 227 2,48 57 kOnL #_R11 309 | 228 2,50 58 kOnL #_R11 310 | 229 2,52 60 kOnL #_R11 311 | 230 47 2,36 kOffL #_reverse__R11 312 | 231 49 2,38 kOffL #_reverse__R11 313 | 232 55 2,44 kOffL #_reverse__R11 314 | 233 56 2,46 kOffL #_reverse__R11 315 | 234 47 2,14 kCatL #_R12 316 | 235 49 2,16 kCatL #_R12 317 | 236 55 2,18 kCatL #_R12 318 | 237 56 2,21 kCatL #_R12 319 | 238 57 1,56 kOffR #_reverse__R1 320 | 239 57 1,63 kCatR #_R2 321 | 240 2,58 64 kOnL #_R3 322 | 241 2,59 65 kOnL #_R3 323 | 242 60 2,49 kOffL #_reverse__R3 324 | 243 61 2,50 kOffL #_reverse__R3 325 | 244 62 2,51 kOffL #_reverse__R3 326 | 245 60 2,32 kCatL #_R4 327 | 246 61 2,33 kCatL #_R4 328 | 247 62 2,34 kCatL #_R4 329 | 248 60 1,55 kOffR #_reverse__R5 330 | 249 60 1,64 kCatR #_R6 331 | 250 2,58 63 kOnL #_R7 332 | 251 2,61 65 kOnL #_R7 333 | 252 57 2,47 kOffL #_reverse__R7 334 | 253 59 2,50 kOffL #_reverse__R7 335 | 254 62 2,53 kOffL #_reverse__R7 336 | 255 57 2,26 kCatL #_R8 337 | 256 59 2,29 kCatL #_R8 338 | 257 62 2,31 kCatL #_R8 339 | 258 62 1,54 kOffR #_reverse__R9 340 | 259 62 1,65 kCatR #_R10 341 | 260 2,59 63 kOnL #_R11 342 | 261 2,61 64 kOnL #_R11 343 | 262 57 2,48 kOffL #_reverse__R11 344 | 263 58 
2,50 kOffL #_reverse__R11 345 | 264 60 2,52 kOffL #_reverse__R11 346 | 265 57 2,25 kCatL #_R12 347 | 266 58 2,28 kCatL #_R12 348 | 267 60 2,30 kCatL #_R12 349 | 268 2,63 66 kOnL #_R3 350 | 269 64 2,58 kOffL #_reverse__R3 351 | 270 65 2,59 kOffL #_reverse__R3 352 | 271 64 2,45 kCatL #_R4 353 | 272 65 2,46 kCatL #_R4 354 | 273 2,64 66 kOnL #_R7 355 | 274 63 2,58 kOffL #_reverse__R7 356 | 275 65 2,61 kOffL #_reverse__R7 357 | 276 63 2,41 kCatL #_R8 358 | 277 65 2,44 kCatL #_R8 359 | 278 2,65 66 kOnL #_R11 360 | 279 63 2,59 kOffL #_reverse__R11 361 | 280 64 2,61 kOffL #_reverse__R11 362 | 281 63 2,40 kCatL #_R12 363 | 282 64 2,43 kCatL #_R12 364 | 283 66 2,63 kOffL #_reverse__R3 365 | 284 66 2,56 kCatL #_R4 366 | 285 66 2,64 kOffL #_reverse__R7 367 | 286 66 2,55 kCatL #_R8 368 | 287 66 2,65 kOffL #_reverse__R11 369 | 288 66 2,54 kCatL #_R12 370 | end reactions 371 | begin groups 372 | 1 Rfree 1 373 | 2 Lfree 2 374 | 3 A1P 9,16,17,18,27,28,29,30,31,38,39,40,41,42,43,44,49,50,51,52,53,54,55,58,59,60,61,62,63,64,65,66 375 | end groups 376 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPUODEBenchmarks 2 | Comparison of Julia's GPU-based ensemble ODE solvers with other open-source implementations in C++, JAX, and PyTorch. These artifacts are part of the paper: 3 | > Automated Translation and Accelerated Solving of Differential Equations on Multiple GPU Platforms 4 | 5 | **_NOTE:_** This repository is meant to contain scripts for benchmarking existing ensemble ODE solvers. For external purposes, one can directly use the solvers from the respective libraries. 6 | 7 | ### Performance comparison with other open-source ensemble ODE solvers 8 | drawing 9 | 10 | ### Works with NVIDIA, Intel, AMD, and Apple GPUs 11 | drawing 12 | 13 | # Reproduction of the benchmarks 14 | 15 | The methods are written in Julia and are part of the repository 16 | . 
The benchmark suite also 17 | consists of the raw data, such as simulation times and plots mentioned 18 | in the paper. The supported OS for the benchmark suite is Linux. 19 | 20 | ## Installing Julia 21 | 22 | Firstly, we will need to install Julia. The user can download the 23 | binaries from the official JuliaLang website 24 | [`https://julialang.org/downloads/`](https://julialang.org/downloads/). 25 | Alternatively, one can use the convenience of a Julia version 26 | multiplexer, . The recommended OS 27 | for installation is Linux. The recommended Julia installation version is 28 | v1.8. To use AMD GPUs, please install v1.9. The Julia installation 29 | should also be added to the user's path. 30 | 31 | ## Setting up DiffEqGPU.jl 32 | 33 | ### Installing backends 34 | 35 | The user must install the GPU backend library for testing 36 | DiffEqGPU.jl-related code. 37 | 38 | ```julia 39 | julia> using Pkg 40 | julia> #Run either of them 41 | julia> Pkg.add("CUDA") # NVIDIA GPUs 42 | julia> Pkg.add("AMDGPU") #AMD GPUs 43 | julia> Pkg.add("oneAPI") #Intel GPUs 44 | julia> Pkg.add("Metal") #Apple M series GPUs 45 | ``` 46 | ### Testing DiffEqGPU.jl 47 | 48 | DiffEqGPU.jl is a test suite that regularly checks functionality by 49 | testing features like multiple backend support, event handling, and 50 | automatic differentiation. To test the functionality, one can follow the 51 | below instructions. The user needs to specify the \"backend\" for 52 | example \"CUDA\" for NVIDIA, \"AMDGPU\" for AMD, \"oneAPI\" for Intel 53 | , and \"Metal\" for Apple GPUs. The estimated time of completion is 20 54 | minutes. 55 | ```julia 56 | $ julia --project=. 57 | julia> using Pkg 58 | julia> Pkg.instantiate() 59 | julia> Pkg.precompile() 60 | ``` 61 | Finally, test the package with this command 62 | ```bash 63 | $ backend="CUDA" 64 | $ julia --project=. 
Additionally, the GitHub discussion
[`https://github.com/SciML/DiffEqGPU.jl/issues/224#issuecomment-1453769679`](https://github.com/SciML/DiffEqGPU.jl/issues/224#issuecomment-1453769679)
highlights the use of texture memory with ODE solvers, which accelerates the
code by $2\times$ over the CPU.
The data will be generated in the `data/Julia` directory, with two files
for fixed and adaptive time-stepping simulations. The first column in
the ".txt" file will be the number of trajectories, and the second
column will contain the time in milliseconds.

Additionally, to benchmark ODE solvers for other backends:
```bash
$ N=$((2**24))
# Benchmark
$ backend="Metal"
$ ./runner_scripts/gpu/run_ode_mult_device.sh \
    $N $backend
```
The 147 | NVIDIA's website lists the resource 148 | [`https://developer.nvidia.com/cuda-downloads`](https://developer.nvidia.com/cuda-downloads) 149 | for installation. 150 | 151 | The MPGOS scripts are in the `GPU_ODE_MPGOS` folder. The file 152 | `GPU_ODE_MPGOS/Lorenz.cu` is the main executed code. However, the MPGOS 153 | programs can be run with the same bash script by changing the arguments 154 | as: 155 | ```bash 156 | $ bash ./run_benchmark.sh -l cpp -d gpu -m ode 157 | ``` 158 | It will generate the data files in the `data/cpp` folder. 159 | 160 | ### Benchmarking JAX (Diffrax) ODE solvers 161 | 162 | Benchmarking JAX-based ODE solvers require installing Python 3.9 and 163 | `conda`. First, we will install all the Python packages for 164 | benchmarking: 165 | ```bash 166 | $ conda env create -f environment.yml 167 | $ conda activate venv_jax 168 | ``` 169 | It should install the correct version of JAX with CUDA enabled and the 170 | Diffrax library. The GitHub 171 | [`https://github.com/google/jax#installation`](https://github.com/google/jax#installation) 172 | is a guide to follow if the installation fails. 173 | 174 | For our purposes, we can benchmark the solvers by: 175 | ```bash 176 | $ bash ./run_benchmark.sh -l jax -d gpu -m ode 177 | ``` 178 | 179 | #### A note on JIT ordering in JAX 180 | 181 | The JIT ordering JAX matters and sometimes can enhance performance if done correctly. We have tested that vmap and JIT ordering does not make a noticeable difference in our case. The results are available at this [Colab notebook](https://colab.research.google.com/drive/1d7G-O5JX31lHbg7jTzzozbo5-Gp7DBEv?usp=sharing). 182 | 183 | ### Benchmarking PyTorch (torchdiffeq) ODE solvers 184 | 185 | Benchmarking PyTorch-based ODE solvers is a similar process compared to 186 | JAX ones. 187 | ```bash 188 | $ conda env create -f environment.yml 189 | $ conda activate venv_torch 190 | ``` 191 | `torchdiffeq` does not fully support vectorized maps with ODE solvers. 
192 | To circumvent this, we extended the functionality by rewriting some 193 | library parts. To download it: 194 | ```bash 195 | (venv_torch)$ pip uninstall torchdiffeq 196 | (venv_torch)$ pip uninstall torchdiffeq 197 | (venv_torch)$ pip install git+https://github.com/\ 198 | utkarsh530/torchdiffeq.git@u/vmap 199 | ``` 200 | Then run the benchmarks by: 201 | ```bash 202 | $ bash ./run_benchmark.sh -l pytorch -d gpu -m ode 203 | ``` 204 | ## Comparing GPU acceleration of ODEs with CPUs 205 | 206 | The benchmark suite can also be used to test the GPU acceleration of ODE 207 | solvers in comparison with CPUs. The process for generating simulation 208 | times for GPUs can be done by following the GPU section mentioned earlier. The following bash script 209 | allows the generation of CPU simulation times for ODEs: 210 | ```bash 211 | $ bash ./run_benchmark.sh -l julia -d cpu -m ode 212 | ``` 213 | The simulation times will be generated in `data/CPU`. Each of the 214 | workflow takes approximately 20 minutes to finish. 215 | 216 | ## Benchmarking GPU acceleration of SDEs with CPUs 217 | 218 | The SDE solvers in Julia are benchmarked by comparing them to the 219 | CPU-accelerated simulation. This will benchmark the linear SDE with 220 | three states, as described in the \"Benchmarks and case studies\" 221 | section. To generate simulation times for GPU, do the following: 222 | ```bash 223 | $ bash ./run_benchmark.sh -l julia -d gpu -m sde 224 | ``` 225 | We can generate the simulation times for CPU-accelerated codes through the following: 226 | ```bash 227 | $ bash ./run_benchmark.sh -l julia -d cpu -m sde 228 | ``` 229 | The results will get generated in `data/SDE` and `data/CPU/SDE`, taking 230 | around 10 minutes to complete. 231 | 232 | ## Composability with MPI 233 | 234 | Julia supports Message Passing Interface (MPI) to allow Single Program 235 | Multiple Data (SPMD) type parallel programming. 
The composability of the 236 | GPU ODE solvers enable seamless integration with MPI, enabling scaling 237 | the ODE solvers to clusters on multiple nodes. 238 | ```julia 239 | $ julia --project=./GPU_ODE_Julia 240 | julia> using Pkg 241 | # install MPI.jl 242 | julia> Pkg.add("MPI") 243 | ``` 244 | An example script solving the Lorenz problem for approximately 1 billion 245 | parameters is available in the `MPI` folder. A SLURM-based script is 246 | shown below. 247 | ```bash 248 | #!/bin/bash 249 | # Slurm Sbatch Options 250 | # Request no. of GPUs/node 251 | #SBATCH --gres=gpu:volta:1 252 | # 1 process per node 253 | #SBATCH -n 5 -N 5 254 | #SBATCH --output="./mpi_scatter_test.log-%j" 255 | # Loading the required module 256 | 257 | # MPI.jl requires a memory pool to be disabled 258 | export JULIA_CUDA_MEMORY_POOL=none 259 | export JULIA_MPI_BINARY=system 260 | # Use local CUDA toolkit installation 261 | export JULIA_CUDA_USE_BINARYBUILDER=false 262 | 263 | source $HOME/.bashrc 264 | module load cuda mpi 265 | 266 | srun hostname > hostfile 267 | time mpiexec julia --project=./GPU_ODE_Julia\ 268 | ./MPI/gpu_ode_mpi.jl 269 | ``` 270 | ## Plotting Results 271 | 272 | The plotting scripts visualize the simulation times. The scripts are 273 | located in the `runner_scripts/plot` folder. These scripts replicate the 274 | benchmark figures in the paper. The benchmark suite contains the 275 | simulation data generated by authors, which can be used to verify the 276 | plots. Various benchmarks can be plotted, which are described in the 277 | different sections. The plotting scripts are based on Julia. As a 278 | preliminary step: 279 | ```julia 280 | $ cd GPUODEBenchmarks 281 | $ julia --project=. 282 | julia> using Pkg 283 | julia> Pkg.instantiate() 284 | julia> Pkg.precompile() 285 | ``` 286 | The plot comparison between Julia, C++, JAX, and PyTorch mentioned in 287 | the paper can be generated by using the below command: 288 | ```bash 289 | $ julia --project=. 
./runner_scripts/plot\ 290 | /plot_ode_comp.jl 291 | ``` 292 | The plot will get saved in the `plots` folder. 293 | 294 | Similarly, the other plots in the paper can be generated by running the 295 | different scripts in the folder `runner_scripts/plot`. 296 | ```bash 297 | plot performance of GPU ODE solvers 298 | with multiple backends 299 | $ julia --project=. ./runner_scripts/plot\ 300 | /plot_mult_gpu.jl 301 | plot GPU ODE solvers comparison with CPUs 302 | $ julia --project=. ./runner_scripts/plot\ 303 | /plot_ode_comp.jl 304 | plot GPU SDE solvers comparison with CPUs 305 | $ julia --project=. ./runner_scripts/plot\ 306 | /plot_sde_comp.jl 307 | plot CRN Network sim comparison with CPUs 308 | $ julia --project=. ./runner_scripts/plot\ 309 | /plot_sde_crn.jl 310 | ``` 311 | To plot data generated by running the scripts, specify the location of 312 | the `data` as the argument to the mentioned command. 313 | ```bash 314 | $ julia --project=. ./runner_scripts/plot/\ 315 | plot_mult_gpu.jl /path/to/data/ 316 | ``` 317 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_Solver.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_SOLVER_H 2 | #define SINGLESYSTEM_PERTHREAD_SOLVER_H 3 | 4 | #include "MPGOS_Overloaded_MathFunction.cuh" 5 | #include "SingleSystem_PerThread_DenseOutput.cuh" 6 | #include "SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh" 7 | #include "SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh" 8 | #include "SingleSystem_PerThread_EventHandling.cuh" 9 | 10 | 11 | template 12 | __global__ void SingleSystem_PerThread(Struct_ThreadConfiguration ThreadConfiguration, Struct_GlobalVariables GlobalVariables, Struct_SharedMemoryUsage SharedMemoryUsage, Struct_SolverOptions SolverOptions) 13 | { 14 | // THREAD MANAGEMENT ------------------------------------------------------ 15 | int tid = 
threadIdx.x + blockIdx.x*blockDim.x; 16 | 17 | 18 | // SHARED MEMORY MANAGEMENT ----------------------------------------------- 19 | // DUE TO REQUIRED MEMORY ALIGMENT: PRECISONS FIRST, INTS NEXT IN DYNAMICALLY ALLOCATED SHARED MEMORY 20 | // MINIMUM ALLOCABLE MEMORY IS 1 21 | extern __shared__ int DynamicSharedMemory[]; 22 | int MemoryShift; 23 | 24 | Precision* gs_SharedParameters = (Precision*)&DynamicSharedMemory; 25 | MemoryShift = (SharedMemoryUsage.PreferSharedMemory == 1 ? NSP : 0); 26 | 27 | int* gs_IntegerSharedParameters = (int*)&gs_SharedParameters[MemoryShift]; 28 | 29 | const bool IsAdaptive = ( Algorithm==RK4 ? 0 : 1 ); 30 | 31 | __shared__ Precision s_RelativeTolerance[ (IsAdaptive==0 ? 1 : SD) ]; 32 | __shared__ Precision s_AbsoluteTolerance[ (IsAdaptive==0 ? 1 : SD) ]; 33 | __shared__ Precision s_EventTolerance[ (NE==0 ? 1 : NE) ]; 34 | __shared__ int s_EventDirection[ (NE==0 ? 1 : NE) ]; 35 | 36 | // Initialise tolerances of adaptive solvers 37 | if ( IsAdaptive == 1 ) 38 | { 39 | const int LaunchesSD = SD / blockDim.x + (SD % blockDim.x == 0 ? 
0 : 1); 40 | #pragma unroll 41 | for (int j=0; j(\ 149 | tid, \ 150 | NT, \ 151 | r_DenseOutputIndex, \ 152 | r_ActualTime, \ 153 | r_TimeStep, \ 154 | r_TimeDomain, \ 155 | r_ActualState, \ 156 | r_ControlParameters, \ 157 | gs_SharedParameters, \ 158 | gs_IntegerSharedParameters, \ 159 | r_Accessories, \ 160 | r_IntegerAccessories); 161 | 162 | if ( NE > 0 ) 163 | { 164 | PerThread_EventFunction(\ 165 | tid, \ 166 | NT, \ 167 | r_ActualEventValue, \ 168 | r_ActualTime, \ 169 | r_TimeStep, \ 170 | r_TimeDomain, \ 171 | r_ActualState, \ 172 | r_ControlParameters, \ 173 | gs_SharedParameters, \ 174 | gs_IntegerSharedParameters, \ 175 | r_Accessories, \ 176 | r_IntegerAccessories); 177 | } 178 | 179 | if ( NDO > 0 ) 180 | { 181 | PerThread_StoreDenseOutput(\ 182 | tid, \ 183 | r_UpdateDenseOutput, \ 184 | r_DenseOutputIndex, \ 185 | GlobalVariables.d_DenseOutputTimeInstances, \ 186 | r_ActualTime, \ 187 | GlobalVariables.d_DenseOutputStates, \ 188 | r_ActualState, \ 189 | r_NumberOfSkippedStores, \ 190 | r_DenseOutputActualTime, \ 191 | SolverOptions.DenseOutputMinimumTimeStep, \ 192 | r_TimeDomain[1]); 193 | } 194 | 195 | 196 | // SOLVER MANAGEMENT -------------------------------------------------- 197 | while ( r_TerminateSimulation == 0 ) 198 | { 199 | // INITIALISE TIME STEPPING --------------------------------------- 200 | r_UpdateStep = 1; 201 | r_IsFinite = 1; 202 | r_EndTimeDomainReached = 0; 203 | 204 | r_TimeStep = r_NewTimeStep; 205 | 206 | if ( r_TimeStep > ( r_TimeDomain[1] - r_ActualTime ) ) 207 | { 208 | r_TimeStep = r_TimeDomain[1] - r_ActualTime; 209 | r_EndTimeDomainReached = 1; 210 | } 211 | 212 | 213 | // STEPPER -------------------------------------------------------- 214 | if ( Algorithm == RK4 ) 215 | { 216 | PerThread_Stepper_RK4(\ 217 | tid, \ 218 | r_ActualTime, \ 219 | r_TimeStep, \ 220 | r_ActualState, \ 221 | r_NextState, \ 222 | r_Error, \ 223 | r_IsFinite, \ 224 | r_ControlParameters, \ 225 | gs_SharedParameters, \ 226 | 
gs_IntegerSharedParameters, \ 227 | r_Accessories, \ 228 | r_IntegerAccessories); 229 | 230 | PerThread_ErrorController_RK4(\ 231 | tid, \ 232 | SolverOptions.InitialTimeStep, \ 233 | r_IsFinite, \ 234 | r_TerminateSimulation, \ 235 | r_NewTimeStep); 236 | } 237 | 238 | if ( Algorithm == RKCK45 ) 239 | { 240 | PerThread_Stepper_RKCK45(\ 241 | tid, \ 242 | r_ActualTime, \ 243 | r_TimeStep, \ 244 | r_ActualState, \ 245 | r_NextState, \ 246 | r_Error, \ 247 | r_IsFinite, \ 248 | r_ControlParameters, \ 249 | gs_SharedParameters, \ 250 | gs_IntegerSharedParameters, \ 251 | r_Accessories, \ 252 | r_IntegerAccessories); 253 | 254 | PerThread_ErrorController_RKCK45(\ 255 | tid, \ 256 | r_TimeStep, \ 257 | r_ActualState, \ 258 | r_NextState, \ 259 | r_Error, \ 260 | s_RelativeTolerance, \ 261 | s_AbsoluteTolerance, \ 262 | r_UpdateStep, \ 263 | r_IsFinite, \ 264 | r_TerminateSimulation, \ 265 | r_NewTimeStep, \ 266 | SolverOptions); 267 | } 268 | 269 | 270 | // NEW EVENT VALUE AND TIME STEP CONTROL--------------------------- 271 | if ( NE > 0 ) 272 | { 273 | PerThread_EventFunction(\ 274 | tid, \ 275 | NT, \ 276 | r_NextEventValue, \ 277 | r_ActualTime+r_TimeStep, \ 278 | r_TimeStep, \ 279 | r_TimeDomain, \ 280 | r_NextState, \ 281 | r_ControlParameters, \ 282 | gs_SharedParameters, \ 283 | gs_IntegerSharedParameters, \ 284 | r_Accessories, \ 285 | r_IntegerAccessories); 286 | 287 | PerThread_EventTimeStepControl(\ 288 | tid, \ 289 | r_UpdateStep, \ 290 | r_TerminateSimulation, \ 291 | r_ActualEventValue, \ 292 | r_NextEventValue, \ 293 | s_EventTolerance, \ 294 | s_EventDirection, \ 295 | r_TimeStep, \ 296 | r_NewTimeStep, \ 297 | SolverOptions.MinimumTimeStep); 298 | } 299 | 300 | 301 | // UPDATE PROCESS ------------------------------------------------- 302 | if ( r_UpdateStep == 1 ) 303 | { 304 | r_ActualTime += r_TimeStep; 305 | 306 | for (int i=0; i(\ 310 | tid, \ 311 | NT, \ 312 | r_UserDefinedTermination, \ 313 | r_ActualTime, \ 314 | r_TimeStep, \ 315 | 
r_TimeDomain, \ 316 | r_ActualState, \ 317 | r_ControlParameters, \ 318 | gs_SharedParameters, \ 319 | gs_IntegerSharedParameters, \ 320 | r_Accessories, \ 321 | r_IntegerAccessories); 322 | 323 | if ( NE > 0 ) 324 | { 325 | for (int i=0; i s_EventTolerance[i] ) && ( abs(r_NextEventValue[i]) < s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 328 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( abs(r_NextEventValue[i]) < s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 329 | { 330 | PerThread_ActionAfterEventDetection(\ 331 | tid, \ 332 | NT, \ 333 | i, \ 334 | r_UserDefinedTermination, \ 335 | r_ActualTime, \ 336 | r_TimeStep, \ 337 | r_TimeDomain, \ 338 | r_ActualState, \ 339 | r_ControlParameters, \ 340 | gs_SharedParameters, \ 341 | gs_IntegerSharedParameters, \ 342 | r_Accessories, \ 343 | r_IntegerAccessories); 344 | } 345 | } 346 | 347 | PerThread_EventFunction(\ 348 | tid, \ 349 | NT, \ 350 | r_NextEventValue, \ 351 | r_ActualTime, \ 352 | r_TimeStep, \ 353 | r_TimeDomain, \ 354 | r_ActualState, \ 355 | r_ControlParameters, \ 356 | gs_SharedParameters, \ 357 | gs_IntegerSharedParameters, \ 358 | r_Accessories, 359 | r_IntegerAccessories); 360 | 361 | for (int i=0; i 0 ) 366 | { 367 | PerThread_DenseOutputStorageCondition(\ 368 | r_ActualTime, \ 369 | r_DenseOutputActualTime, \ 370 | r_DenseOutputIndex, \ 371 | r_NumberOfSkippedStores, \ 372 | r_EndTimeDomainReached, \ 373 | r_UserDefinedTermination, \ 374 | r_UpdateDenseOutput, \ 375 | SolverOptions); 376 | 377 | PerThread_StoreDenseOutput(\ 378 | tid, \ 379 | r_UpdateDenseOutput, \ 380 | r_DenseOutputIndex, \ 381 | GlobalVariables.d_DenseOutputTimeInstances, \ 382 | r_ActualTime, \ 383 | GlobalVariables.d_DenseOutputStates, \ 384 | r_ActualState, \ 385 | r_NumberOfSkippedStores, \ 386 | r_DenseOutputActualTime, \ 387 | SolverOptions.DenseOutputMinimumTimeStep, \ 388 | r_TimeDomain[1]); 389 | } 390 | 391 | if ( ( r_EndTimeDomainReached == 1 ) || ( r_UserDefinedTermination == 1 ) 
) 392 | r_TerminateSimulation = 1; 393 | } 394 | } 395 | 396 | 397 | // FINALISATION ----------------------------------------------------------- 398 | PerThread_Finalization(\ 399 | tid, \ 400 | NT, \ 401 | r_DenseOutputIndex, \ 402 | r_ActualTime, \ 403 | r_TimeStep, \ 404 | r_TimeDomain, \ 405 | r_ActualState, \ 406 | r_ControlParameters, \ 407 | gs_SharedParameters, \ 408 | gs_IntegerSharedParameters, \ 409 | r_Accessories, \ 410 | r_IntegerAccessories); 411 | 412 | 413 | // WRITE DATA BACK TO GLOBAL MEMORY --------------------------------------- 414 | #pragma unroll 415 | for (int i=0; i<2; i++) 416 | GlobalVariables.d_TimeDomain[tid + i*NT] = r_TimeDomain[i]; 417 | 418 | #pragma unroll 419 | for (int i=0; i 7 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_StoreDenseOutput(\ 8 | int* s_UpdateStep, \ 9 | int* s_UpdateDenseOutput, \ 10 | int* s_DenseOutputIndex, \ 11 | int* s_NumberOfSkippedStores, \ 12 | Precision* d_DenseOutputTimeInstances, \ 13 | Precision* d_DenseOutputStates, \ 14 | Precision* s_DenseOutputActualTime, \ 15 | Precision* s_ActualTime, \ 16 | Precision r_ActualState[NBL][UD], \ 17 | Precision s_TimeDomain[SPB][2], \ 18 | Struct_ThreadConfiguration ThreadConfiguration, \ 19 | Struct_SolverOptions SolverOptions) 20 | { 21 | int LocalThreadID_GPU = threadIdx.x; 22 | int BlockID = blockIdx.x; 23 | int GlobalThreadID_Logical; 24 | int LocalThreadID_Logical; 25 | int LocalSystemID; 26 | int GlobalSystemID; 27 | int GlobalMemoryID; 28 | int UnitID; 29 | 30 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 31 | 32 | for (int BL=0; BL 82 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_DenseOutputStorageCondition(\ 83 | int* s_EndTimeDomainReached, \ 84 | int* s_UserDefinedTermination, \ 85 | int* s_UpdateStep, \ 86 | int* s_UpdateDenseOutput, \ 87 | int* s_DenseOutputIndex, \ 88 | int* s_NumberOfSkippedStores, \ 89 | Precision* 
s_DenseOutputActualTime, \ 90 | Precision* s_ActualTime, \ 91 | Struct_SolverOptions SolverOptions) 92 | { 93 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 94 | for (int j=0; j= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 103 | s_UpdateDenseOutput[lsid] = 1; 104 | else 105 | s_UpdateDenseOutput[lsid] = 0; 106 | 107 | if ( ( s_DenseOutputIndex[lsid] < NDO ) && ( ( s_EndTimeDomainReached[lsid] == 1 ) || ( s_UserDefinedTermination[lsid] == 1 ) ) ) 108 | s_UpdateDenseOutput[lsid] = 1; 109 | } else 110 | s_UpdateDenseOutput[lsid] = 0; 111 | } 112 | } 113 | __syncthreads(); 114 | } 115 | 116 | 117 | // SSMBL ---------------------------------------------------------------------- 118 | template 119 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_StoreDenseOutput(\ 120 | int s_UpdateStep, \ 121 | int s_UpdateDenseOutput, \ 122 | int& s_DenseOutputIndex, \ 123 | int& s_NumberOfSkippedStores, \ 124 | Precision* d_DenseOutputTimeInstances, \ 125 | Precision* d_DenseOutputStates, \ 126 | Precision& s_DenseOutputActualTime, \ 127 | Precision s_ActualTime, \ 128 | Precision r_ActualState[NBL][UD], \ 129 | Precision s_TimeDomain[2], \ 130 | Struct_ThreadConfiguration ThreadConfiguration, \ 131 | Struct_SolverOptions SolverOptions) 132 | { 133 | int LocalThreadID_GPU = threadIdx.x; 134 | int GlobalSystemID = blockIdx.x; 135 | int GlobalThreadID_Logical; 136 | int UnitID; 137 | int GlobalMemoryID; 138 | 139 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 140 | 141 | for (int BL=0; BL 182 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_DenseOutputStorageCondition(\ 183 | int s_EndTimeDomainReached, \ 184 | int s_UserDefinedTermination, \ 185 | int s_UpdateStep, \ 186 | int& s_UpdateDenseOutput, \ 187 | int s_DenseOutputIndex, \ 188 | int s_NumberOfSkippedStores, \ 189 | Precision s_DenseOutputActualTime, \ 190 | Precision s_ActualTime, \ 191 | 
Struct_SolverOptions SolverOptions) 192 | { 193 | if ( threadIdx.x == 0 ) 194 | { 195 | if ( s_UpdateStep == 1 ) 196 | { 197 | if ( ( s_DenseOutputIndex < NDO ) && ( s_DenseOutputActualTime < s_ActualTime ) && ( s_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 198 | s_UpdateDenseOutput = 1; 199 | else 200 | s_UpdateDenseOutput = 0; 201 | 202 | if ( ( s_DenseOutputIndex < NDO ) && ( ( s_EndTimeDomainReached == 1 ) || ( s_UserDefinedTermination == 1 ) ) ) 203 | s_UpdateDenseOutput = 1; 204 | } else 205 | s_UpdateDenseOutput = 0; 206 | } 207 | __syncthreads(); 208 | } 209 | 210 | 211 | // MSSBL ---------------------------------------------------------------------- 212 | template 213 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_StoreDenseOutput(\ 214 | const int BlockID, \ 215 | const int LocalSystemID, \ 216 | const int UnitID, \ 217 | const int GlobalSystemID, \ 218 | int* s_UpdateStep, \ 219 | int* s_UpdateDenseOutput, \ 220 | int* s_DenseOutputIndex, \ 221 | int* s_NumberOfSkippedStores, \ 222 | Precision* d_DenseOutputTimeInstances, \ 223 | Precision* d_DenseOutputStates, \ 224 | Precision* s_DenseOutputActualTime, \ 225 | Precision* s_ActualTime, \ 226 | Precision r_ActualState[UD], \ 227 | Precision s_TimeDomain[SPB][2], \ 228 | Struct_ThreadConfiguration ThreadConfiguration, \ 229 | Struct_SolverOptions SolverOptions) 230 | { 231 | int GlobalThreadID_Logical; 232 | int GlobalMemoryID; 233 | 234 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 235 | 236 | if ( ( LocalSystemID < SPB ) && ( s_UpdateDenseOutput[LocalSystemID] == 1 ) ) 237 | { 238 | for (int i=0; i 277 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_DenseOutputStorageCondition(\ 278 | int* s_EndTimeDomainReached, \ 279 | int* s_UserDefinedTermination, \ 280 | int* s_UpdateStep, \ 281 | int* s_UpdateDenseOutput, \ 282 | int* s_DenseOutputIndex, \ 283 | int* 
s_NumberOfSkippedStores, \ 284 | Precision* s_DenseOutputActualTime, \ 285 | Precision* s_ActualTime, \ 286 | Struct_SolverOptions SolverOptions) 287 | { 288 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 289 | for (int j=0; j= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 298 | s_UpdateDenseOutput[lsid] = 1; 299 | else 300 | s_UpdateDenseOutput[lsid] = 0; 301 | 302 | if ( ( s_DenseOutputIndex[lsid] < NDO ) && ( ( s_EndTimeDomainReached[lsid] == 1 ) || ( s_UserDefinedTermination[lsid] == 1 ) ) ) 303 | s_UpdateDenseOutput[lsid] = 1; 304 | } else 305 | s_UpdateDenseOutput[lsid] = 0; 306 | } 307 | } 308 | __syncthreads(); 309 | } 310 | 311 | 312 | // SSSBL ---------------------------------------------------------------------- 313 | template 314 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_StoreDenseOutput(\ 315 | const int LocalThreadID, \ 316 | const int GlobalThreadID, \ 317 | const int GlobalSystemID, \ 318 | int s_UpdateStep, \ 319 | int s_UpdateDenseOutput, \ 320 | int& s_DenseOutputIndex, \ 321 | int& s_NumberOfSkippedStores, \ 322 | Precision* d_DenseOutputTimeInstances, \ 323 | Precision* d_DenseOutputStates, \ 324 | Precision& s_DenseOutputActualTime, \ 325 | Precision s_ActualTime, \ 326 | Precision r_ActualState[UD], \ 327 | Precision s_TimeDomain[2], \ 328 | Struct_ThreadConfiguration ThreadConfiguration, \ 329 | Struct_SolverOptions SolverOptions) 330 | { 331 | int GlobalMemoryID; 332 | 333 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 334 | 335 | if ( ( LocalThreadID < UPS ) && ( s_UpdateDenseOutput == 1 ) ) 336 | { 337 | for (int i=0; i 369 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_DenseOutputStorageCondition(\ 370 | int s_EndTimeDomainReached, \ 371 | int s_UserDefinedTermination, \ 372 | int s_UpdateStep, \ 373 | int& s_UpdateDenseOutput, \ 374 | int s_DenseOutputIndex, \ 375 | int s_NumberOfSkippedStores, \ 376 | 
Precision s_DenseOutputActualTime, \ 377 | Precision s_ActualTime, \ 378 | Struct_SolverOptions SolverOptions) 379 | { 380 | if ( threadIdx.x == 0 ) 381 | { 382 | if ( s_UpdateStep == 1 ) 383 | { 384 | if ( ( s_DenseOutputIndex < NDO ) && ( s_DenseOutputActualTime < s_ActualTime ) && ( s_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 385 | s_UpdateDenseOutput = 1; 386 | else 387 | s_UpdateDenseOutput = 0; 388 | 389 | if ( ( s_DenseOutputIndex < NDO ) && ( ( s_EndTimeDomainReached == 1 ) || ( s_UserDefinedTermination == 1 ) ) ) 390 | s_UpdateDenseOutput = 1; 391 | } else 392 | s_UpdateDenseOutput = 0; 393 | } 394 | __syncthreads(); 395 | } 396 | 397 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/TMP.cuh: -------------------------------------------------------------------------------- 1 | // SINGLE SYSTEM PERTHREAD 2 | 3 | // Test shared memory 4 | if ( tid == 0 ) 5 | { 6 | for (int i=0; i= SPB ) || ( GlobalSystemID >= NS ) || ( GlobalSystemID >= SolverOptions.ActiveSystems ) ) 90 | LimitReached = 1; 91 | else 92 | LimitReached = 0; 93 | 94 | printf("GlbTID_Log: %d, GlbTID_GPU: %d, BlockID: %d, LocTID_Log: %d, LocTID_GPU: %d, GlbSID: %d, LocSID: %d, UnitID: %d, LIMIT: %d \n", \ 95 | GlobalThreadID_Logical, GlobalThreadID_GPU, BlockID, LocalThreadID_Logical, LocalThreadID_GPU, GlobalSystemID, LocalSystemID, UnitID, LimitReached); 96 | } 97 | 98 | // Testing block scope shared memory variables 99 | if ( ( threadIdx.x == 0 ) ) 100 | { 101 | printf("Block ID: %d, s_TerminatedSystemsPerBlock: %d \n", blockIdx.x, s_TerminatedSystemsPerBlock); 102 | printf("Block ID: %d, s_TSS[0]: %d, s_TSS[1]: %d, s_TSS[2]: %d \n", blockIdx.x, s_TerminateSystemScope[0], s_TerminateSystemScope[1], s_TerminateSystemScope[2]); 103 | } 104 | 105 | // Testing global shared memory variables 106 | if ( ( blockIdx.x == 0 ) && ( threadIdx.x == 0 ) ) 107 | { 108 | for (int i=0; i 0 ) ) 186 | { 187 | int 
idx; 188 | for (int row=0; row 0 ) ) 201 | { 202 | int idx; 203 | for (int col=0; col<(2*CBW+1); col++) 204 | { 205 | idx = col + MemoryShift; 206 | printf("%6.3e ", gs_CouplingMatrix[idx]); 207 | } 208 | printf("\n"); 209 | } 210 | 211 | // Full circular 212 | if ( ( CCI == 1 ) && ( CBW == 0 ) ) 213 | { 214 | int idx; 215 | for (int col=0; col