├── texture_memory ├── README.md ├── forecast.txt ├── Project.toml └── test_texturemem.jl ├── .gitignore ├── GPU_ODE_Julia ├── .JuliaFormatter.toml ├── Project.toml ├── bin │ ├── ode_problems │ │ ├── linear.jl │ │ ├── multistate.jl │ │ ├── lorenz.jl │ │ ├── multisite2.jl │ │ ├── pleiades.jl │ │ └── pollu.jl │ └── catalyst_models │ │ ├── multistate.net │ │ └── multisite2.net ├── src │ └── GPU_ODE_Julia.jl ├── sde_examples │ ├── bench_cpu.jl │ ├── bench_gpu.jl │ └── bench_crn_model.jl ├── bench_cpu.jl ├── bench_lorenz_gpu.jl ├── bench_ensemblegpuarray.jl └── bench_multi_device.jl ├── paper_artifacts ├── figures │ ├── CPU_Lorenz.png │ ├── CPU_SDE_MTK.png │ ├── CPU_SDE_Linear.png │ ├── Lorenz_adaptive.png │ ├── Lorenz_unadaptive.png │ ├── CPU_Lorenz_adaptive.png │ ├── CPU_Lorenz_unadaptive.png │ └── Multi_GPU_unadaptive.png └── data │ ├── Julia │ ├── SDE │ │ ├── CRN │ │ │ └── Julia_times_unadaptive.txt │ │ └── Julia_times_unadaptive.txt │ ├── CPU │ │ ├── SDE │ │ │ ├── CRN │ │ │ │ └── Julia_times_unadaptive.txt │ │ │ └── times_unadaptive.txt │ │ ├── times_adaptive.txt │ │ └── times_unadaptive.txt │ ├── devices │ │ ├── oneAPI │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ ├── CUDA │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ ├── Metal │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ │ └── AMDGPU │ │ │ ├── Julia_times_adaptive.txt │ │ │ └── Julia_times_unadaptive.txt │ └── EnsembleGPUArray │ │ ├── Julia_EnGPUArray_times_adaptive.txt │ │ └── Julia_EnGPUArray_times_unadaptive.txt │ ├── RTX_5000 │ ├── CPP │ │ ├── MPGOS_times_adaptive.txt │ │ └── MPGOS_times_unadaptive.txt │ ├── Julia │ │ ├── Julia_times_adaptive.txt │ │ └── Julia_times_unadaptive.txt │ ├── JAX │ │ ├── Jax_times_adaptive.txt │ │ └── Jax_times_unadaptive.txt │ └── PyTorch │ │ └── Torch_times_unadaptive.txt │ └── Tesla_V100 │ ├── CPP │ ├── MPGOS_times_adaptive.txt │ └── MPGOS_times_unadaptive.txt │ ├── Julia │ ├── 
Julia_times_unadaptive.txt │ └── Julia_times_adaptive.txt │ ├── JAX │ ├── Jax_times_unadaptive.txt │ └── Jax_times_adaptive.txt │ └── PyTorch │ └── Torch_times_unadaptive.txt ├── runner_scripts ├── gpu │ ├── run_ode_julia.sh │ ├── run_ode_pytorch.sh │ ├── run_ode_jax.sh │ ├── run_egarray_julia.sh │ ├── run_sde_julia.sh │ ├── run_ode_mult_device.sh │ ├── run_sde_crn.sh │ └── run_ode_cpp.sh ├── cpu │ ├── run_ode_julia.sh │ └── run_sde_julia.sh └── plot │ ├── plot_sde_comp.jl │ ├── plot_mult_gpu.jl │ ├── plot_sde_crn.jl │ ├── plot_ode_comp.jl │ └── plot_cpu_comp.jl ├── test_DiffEqGPU.jl ├── Project.toml ├── GPU_ODE_MPGOS ├── makefile ├── ProfileSpec.sh ├── SourceCodes │ ├── SingleSystem_PerThread_EventHandling.cuh │ ├── SingleSystem_PerThread_DenseOutput.cuh │ ├── MPGOS_Overloaded_MathFunction.cuh │ ├── SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh │ ├── SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh │ ├── CoupledSystems_PerBlock_EventHandling.cuh │ ├── SingleSystem_PerThread_Solver.cuh │ ├── CoupledSystems_PerBlock_DenseOutput.cuh │ └── TMP.cuh ├── Lorenz_SystemDefinition.cuh └── Lorenz.cu ├── MPI ├── mpi_test.sh └── gpu_ode_mpi.jl ├── GPU_ODE_JAX ├── requirements.txt ├── environment.yml └── bench_diffrax.py ├── LICENSE ├── GPU_ODE_PyTorch ├── requirements.txt ├── bench_torchdiffeq.py └── environment.yml ├── run_benchmark.sh └── README.md /texture_memory/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | plots 2 | *.exe 3 | /data -------------------------------------------------------------------------------- /GPU_ODE_Julia/.JuliaFormatter.toml: -------------------------------------------------------------------------------- 1 | style = "sciml" 
-------------------------------------------------------------------------------- /texture_memory/forecast.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/texture_memory/forecast.txt -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_SDE_MTK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_SDE_MTK.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_SDE_Linear.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_SDE_Linear.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Lorenz_adaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Lorenz_adaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Lorenz_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Lorenz_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz_adaptive.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz_adaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/CPU_Lorenz_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/CPU_Lorenz_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/figures/Multi_GPU_unadaptive.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/utkarsh530/GPUODEBenchmarks/HEAD/paper_artifacts/figures/Multi_GPU_unadaptive.png -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/SDE/CRN/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 6720 244.754027 2 | 26880 620.783003 3 | 107520 2042.824231 4 | 430080 7484.933462 5 | 1720320 29404.353759 6 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/SDE/CRN/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 6720 921.966429 2 | 26880 1944.847764 3 | 107520 6746.065749 4 | 430080 27894.174424 5 | 1720320 105755.144901 -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.032243 2 | 32 0.048715 3 | 128 0.107526 4 | 512 0.326734 5 | 2048 1.155272 6 | 8192 4.821152 7 | 32768 21.28074 8 | 131072 98.841839 9 | 524288 394.983798 10 | 
-------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.125069 2 | 32 0.219333 3 | 128 0.716072 4 | 512 2.721196 5 | 2048 11.163812 6 | 8192 43.882179 7 | 32768 175.232735 8 | 131072 707.112045 9 | 524288 3056.280229 10 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/CPP/MPGOS_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.144 2 | 32 0.147 3 | 128 0.152 4 | 512 0.153 5 | 2048 0.153 6 | 8192 0.245 7 | 32768 0.814 8 | 131072 3.113 9 | 524288 12.135 10 | 2097152 48.058 11 | 8388608 191.764 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/CPP/MPGOS_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.078 2 | 32 1.921 3 | 128 1.889 4 | 512 1.963 5 | 2048 1.921 6 | 8192 1.908 7 | 32768 1.942 8 | 131072 2.256 9 | 524288 3.575 10 | 2097152 11.046 11 | 8388608 48.808 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/CPP/MPGOS_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.118 2 | 32 2.092 3 | 128 2.04 4 | 512 1.992 5 | 2048 2.075 6 | 8192 2.023 7 | 32768 2.208 8 | 131072 3.025 9 | 524288 6.421 10 | 2097152 23.62 11 | 8388608 94.157 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/CPP/MPGOS_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.426 2 | 32 0.42 3 | 128 0.474 4 | 512 0.473 5 | 2048 0.479 6 | 8192 1.121 7 | 32768 4.12 8 | 131072 16.196 9 | 524288 63.315 10 | 2097152 248.314 11 | 8388608 886.082 
12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/Julia/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.231724 2 | 32 0.208332 3 | 128 0.232428 4 | 512 0.209701 5 | 2048 0.225962 6 | 8192 0.238633 7 | 32768 0.785327 8 | 131072 3.443626 9 | 524288 9.74068 10 | 2097152 36.68774 11 | 8388608 145.316839 -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | while [ $a -le $max_a ] 4 | do 5 | # Print the values 6 | echo $a 7 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_lorenz_gpu.jl $a 8 | # increment the value 9 | a=$((a*4)) 10 | done 11 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/SDE/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.381258 2 | 32 0.379741 3 | 128 0.419727 4 | 512 0.620064 5 | 2048 1.420111 6 | 8192 2.416906 7 | 32768 4.512974 8 | 131072 14.118681 9 | 524288 48.056674 10 | 2097152 181.737472 11 | 8388608 715.859498 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/oneAPI/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.804296 2 | 32 0.790966 3 | 128 0.815685 4 | 512 0.888314 5 | 2048 0.877024 6 | 8192 1.022843 7 | 32768 1.186602 8 | 131072 2.071555 9 | 524288 6.023225 10 | 2097152 22.745966 11 | 8388608 88.253276 -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/oneAPI/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.789936 2 | 32 
0.728326 3 | 128 0.814415 4 | 512 0.896874 5 | 2048 0.892524 6 | 8192 1.235761 7 | 32768 1.983526 8 | 131072 3.438985 9 | 524288 12.007101 10 | 2097152 46.710124 11 | 8388608 185.5231 -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/Julia/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.301317 2 | 32 0.301541 3 | 128 0.30454 4 | 512 0.330246 5 | 2048 0.349348 6 | 8192 0.368268 7 | 32768 0.61695 8 | 131072 1.582687 9 | 524288 4.655955 10 | 2097152 17.840218 11 | 8388608 69.451464 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/CUDA/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.252927 2 | 32 0.259247 3 | 128 0.261577 4 | 512 0.272637 5 | 2048 0.294848 6 | 8192 0.370816 7 | 32768 0.754333 8 | 131072 1.665724 9 | 524288 5.511288 10 | 2097152 20.722764 11 | 8388608 83.490168 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/Metal/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.967709 2 | 32 0.999 3 | 128 1.013375 4 | 512 1.187959 5 | 2048 1.373083 6 | 8192 1.471291 7 | 32768 2.381292 8 | 131072 6.039792 9 | 524288 20.657333 10 | 2097152 79.474417 11 | 8388608 317.480458 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/Julia/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.252927 2 | 32 0.259247 3 | 128 0.261577 4 | 512 0.272637 5 | 2048 0.294848 6 | 8192 0.370816 7 | 32768 0.754333 8 | 131072 1.665724 9 | 524288 5.511288 10 | 2097152 20.722764 11 | 8388608 83.490168 
-------------------------------------------------------------------------------- /paper_artifacts/data/Julia/CPU/SDE/times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.037384 2 | 32 0.154911 3 | 128 0.551648 4 | 512 2.20597 5 | 2048 4.844585 6 | 8192 5.201728 7 | 32768 10.771976 8 | 131072 43.050631 9 | 524288 170.593541 10 | 2097152 696.775282 11 | 8388608 6702.486593 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/AMDGPU/Julia_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 2.231099 2 | 32 2.19581 3 | 128 2.394492 4 | 512 2.858904 5 | 2048 4.108183 6 | 8192 4.520914 7 | 32768 16.186817 8 | 131072 91.010872 9 | 524288 380.084487 10 | 2097152 1536.688493 11 | 8388608 6163.51325 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/AMDGPU/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.843078 2 | 32 0.838157 3 | 128 0.832271 4 | 512 1.349344 5 | 2048 2.322879 6 | 8192 2.332571 7 | 32768 2.372267 8 | 131072 4.596871 9 | 524288 17.053023 10 | 2097152 66.638069 11 | 8388608 273.612436 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/CUDA/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.168368 2 | 32 0.171038 3 | 128 0.211548 4 | 512 0.209778 5 | 2048 0.213208 6 | 8192 0.331017 7 | 32768 0.904711 8 | 131072 3.280279 9 | 524288 11.920956 10 | 2097152 46.264071 11 | 8388608 183.919764 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/devices/Metal/Julia_times_unadaptive.txt: 
-------------------------------------------------------------------------------- 1 | 8 0.716333 2 | 32 0.742291 3 | 128 0.721709 4 | 512 1.254791 5 | 2048 1.971708 6 | 8192 2.002875 7 | 32768 2.038125 8 | 131072 6.25925 9 | 524288 23.261792 10 | 2097152 90.567667 11 | 8388608 364.321583 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/Julia/Julia_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 0.168368 2 | 32 0.171038 3 | 128 0.211548 4 | 512 0.209778 5 | 2048 0.213208 6 | 8192 0.331017 7 | 32768 0.904711 8 | 131072 3.280279 9 | 524288 11.920956 10 | 2097152 46.264071 11 | 8388608 183.919764 -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_pytorch.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | while [ $a -le $max_a ] 4 | do 5 | # Print the values 6 | echo "No. 
of trajectories = $a" 7 | python3 ./GPU_ODE_PyTorch/bench_torchdiffeq.py $a 8 | # increment the value 9 | a=$((a*4)) 10 | done 11 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/EnsembleGPUArray/Julia_EnGPUArray_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 15.947266 2 | 32 15.95629 3 | 128 16.423258 4 | 512 18.110998 5 | 2048 18.736388 6 | 8192 19.82563 7 | 32768 24.065729 8 | 131072 49.500582 9 | 524288 192.086775 10 | 2097152 785.861873 11 | 8388608 3061.065027 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Julia/EnsembleGPUArray/Julia_EnGPUArray_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 99.183004 2 | 32 100.362064 3 | 128 100.833487 4 | 512 122.938753 5 | 2048 122.718199 6 | 8192 130.923533 7 | 32768 134.314086 8 | 131072 304.430394 9 | 524288 1148.009043 10 | 2097152 4150.856981 11 | 8388608 15826.752605 12 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_jax.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | XLA_PYTHON_CLIENT_PREALLOCATE=false 4 | while [ $a -le $max_a ] 5 | do 6 | # Print the values 7 | echo "No. 
of trajectories = $a" 8 | python3 ./GPU_ODE_JAX/bench_diffrax.py $a 9 | # increment the value 10 | a=$((a*4)) 11 | done 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/JAX/Jax_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 11.396918998798355 2 | 32 11.75047601282131 3 | 128 12.41002899769228 4 | 512 12.64855700719636 5 | 2048 12.49069900950417 6 | 8192 14.121492000413127 7 | 32768 21.169946994632483 8 | 131072 206.5049299999373 9 | 524288 454.2432949965587 10 | 2097152 1770.2114550047554 11 | 8388608 7067.879137990531 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/JAX/Jax_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 46.13151300873142 2 | 32 44.98972499277443 3 | 128 44.53597999236081 4 | 512 44.87656700075604 5 | 2048 45.046036000712775 6 | 8192 44.75162898597773 7 | 32768 80.44722399790771 8 | 131072 552.1373499941546 9 | 524288 2157.217366999248 10 | 2097152 8489.217234004172 11 | 8388608 33885.78143600898 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/RTX_5000/PyTorch/Torch_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 609.113594000064 2 | 32 627.9670649998934 3 | 128 637.831831999847 4 | 512 634.4590839999 5 | 2048 624.9206489999324 6 | 8192 623.1320730000789 7 | 32768 615.158344000065 8 | 131072 644.2248739999741 9 | 524288 2594.314949999898 10 | 2097152 10222.998637000046 11 | 8388608 40889.64921499996 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/JAX/Jax_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 60.93443697318435 2 | 32 61.113049974665046 3 
| 128 63.0807199049741 4 | 512 67.59428302757442 5 | 2048 67.74979480542243 6 | 8192 70.7671670243144 7 | 32768 82.74353202432394 8 | 131072 197.92419183067977 9 | 524288 730.48299504444 10 | 2097152 2684.088410111144 11 | 8388608 10038.143360987306 12 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/JAX/Jax_times_adaptive.txt: -------------------------------------------------------------------------------- 1 | 8 12.42110994644463 2 | 32 13.530798023566604 3 | 128 14.264097204431891 4 | 512 14.479244127869606 5 | 2048 15.887931920588017 6 | 8192 15.419425908476114 7 | 32768 19.164706114679575 8 | 131072 47.79849713668227 9 | 524288 190.3297039680183 10 | 2097152 680.2465228829533 11 | 8388608 2610.883393092081 12 | -------------------------------------------------------------------------------- /test_DiffEqGPU.jl: -------------------------------------------------------------------------------- 1 | using TestEnv 2 | using Pkg 3 | 4 | Pkg.add("DiffEqGPU") 5 | TestEnv.activate("DiffEqGPU") 6 | backend = ARGS[1] 7 | ENV["GROUP"] = backend 8 | Pkg.add(backend) 9 | 10 | ENV["JULIA_LOAD_PATH"]=dirname(Base.active_project()) 11 | 12 | using DiffEqGPU 13 | include(joinpath(dirname(pathof(DiffEqGPU)), "..", "test", "runtests.jl")) 14 | -------------------------------------------------------------------------------- /paper_artifacts/data/Tesla_V100/PyTorch/Torch_times_unadaptive.txt: -------------------------------------------------------------------------------- 1 | 8 775.3896450158209 2 | 32 1098.3309479197487 3 | 128 794.1124310018495 4 | 512 774.6156470384449 5 | 2048 781.6141300136223 6 | 8192 781.7080520326272 7 | 32768 785.07510793861 8 | 131072 790.6077449442819 9 | 524288 1086.9193120161071 10 | 2097152 3784.109622007236 11 | 8388608 14353.03207905963 12 | -------------------------------------------------------------------------------- /Project.toml: 
-------------------------------------------------------------------------------- 1 | [deps] 2 | Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" 3 | DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" 4 | LaTeXStrings = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" 5 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" 6 | Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 7 | StatsPlots = "f3b207a7-027a-5e70-b257-86293d7955fd" 8 | TestEnv = "1e6cf692-eddd-4d53-88a5-2d735e33781b" 9 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/makefile: -------------------------------------------------------------------------------- 1 | ROOT_DIR = $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) 2 | INCL_DIR = -I$(ROOT_DIR)/SourceCodes 3 | CMPL_OPT = -O3 -std=c++11 --ptxas-options=-v --gpu-architecture=sm_70 -lineinfo -maxrregcount=128 4 | SOURCE = Lorenz.cu 5 | 6 | all: Lorenz.exe 7 | 8 | Lorenz.exe: $(SOURCE) 9 | nvcc -o Lorenz.exe $(SOURCE) $(INCL_DIR) $(CMPL_OPT) 10 | 11 | clean: 12 | rm -f Lorenz.exe 13 | -------------------------------------------------------------------------------- /runner_scripts/cpu/run_ode_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="CPU" 5 | if [ -d "./data/${path}" ] 6 | then 7 | rm -rf "./data/${path}" 8 | mkdir -p "./data/${path}" 9 | else 10 | mkdir -p "./data/${path}" 11 | fi 12 | 13 | while [ $a -le $max_a ] 14 | do 15 | # Print the values 16 | echo $a 17 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_cpu.jl $a 18 | # increment the value 19 | a=$((a*4)) 20 | done 21 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_egarray_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | path="EnsembleGPUArray" 4 | if [ -d "./data/${path}" ] 5 | then 6 | rm -rf "./data/${path}" 7 | 
mkdir -p "./data/${path}" 8 | else 9 | mkdir -p "./data/${path}" 10 | fi 11 | while [ $a -le $max_a ] 12 | do 13 | # Print the values 14 | echo $a 15 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_ensemblegpuarray.jl $a 16 | # increment the value 17 | a=$((a*4)) 18 | done 19 | -------------------------------------------------------------------------------- /MPI/mpi_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Slurm Sbatch Options 3 | #SBATCH --gres=gpu:volta:1 4 | #SBATCH -n 5 -N 5 5 | #SBATCH --output="./mpi_scatter_test.log-%j" 6 | # Loading the required module 7 | 8 | export JULIA_CUDA_MEMORY_POOL=none 9 | export JULIA_MPI_BINARY=system 10 | export JULIA_CUDA_USE_BINARYBUILDER=false 11 | 12 | source $HOME/.bashrc 13 | module load cuda/11.6 mpi/openmpi-4.1.3 14 | 15 | srun hostname > hostfile 16 | #script 17 | time mpiexec julia gpu_ode_mpi.jl 18 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_sde_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="SDE" 5 | if [ -d "./data/${path}" ] 6 | then 7 | rm -rf "./data/${path}"/* || true 8 | mkdir -p "./data/${path}" 9 | else 10 | mkdir -p "./data/${path}" 11 | fi 12 | 13 | 14 | while [ $a -le $max_a ] 15 | do 16 | # Print the values 17 | echo $a 18 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_gpu.jl $a 19 | # increment the value 20 | a=$((a*4)) 21 | done 22 | -------------------------------------------------------------------------------- /runner_scripts/cpu/run_sde_julia.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | 4 | path="CPU" 5 | if [ -d "./data/${path}/SDE" ] 6 | then 7 | rm -f "./data/${path}/SDE"/* || true 8 | mkdir -p "./data/${path}/SDE" 9 | else 10 | mkdir -p "./data/${path}/SDE" 11 | fi 12 | 13 | 
while [ $a -le $max_a ] 14 | do 15 | # Print the values 16 | echo $a 17 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_cpu.jl $a 18 | # increment the value 19 | a=$((a*4)) 20 | done 21 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_mult_device.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | max_a=$1 3 | backend=$2 4 | if [ -d "./data/devices/${backend}" ] 5 | then 6 | rm -rf "./data/devices/${backend}" 7 | mkdir -p "./data/devices/${backend}" 8 | else 9 | mkdir -p "./data/devices/${backend}" 10 | fi 11 | 12 | while [ $a -le $max_a ] 13 | do 14 | # Print the values 15 | echo $a 16 | julia --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/bench_multi_device.jl $a $backend 17 | # increment the value 18 | a=$((a*4)) 19 | done 20 | -------------------------------------------------------------------------------- /texture_memory/Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 3 | CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" 4 | DataInterpolations = "82cc6244-b520-54b8-b5a6-8a565e85f1d0" 5 | DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" 6 | Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" 7 | LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 8 | OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" 9 | Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" 10 | Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 11 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 12 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_sde_crn.sh: -------------------------------------------------------------------------------- 1 | a=2 2 | max_a=4 3 | 4 | path="SDE" 5 | if [ -d "./data/${path}/CRN" ] 6 | then 7 | rm -rf "./data/${path}/CRN" 8 | mkdir -p 
"./data/${path}/CRN" 9 | 10 | rm -rf "./data/CPU/${path}/CRN" 11 | mkdir -p "./data/CPU/${path}/CRN" 12 | else 13 | mkdir -p "./data/${path}/CRN" 14 | mkdir -p "./data/CPU/${path}/CRN" 15 | fi 16 | 17 | while [ $a -le $max_a ] 18 | do 19 | # Print the values 20 | echo $a 21 | julia --threads=16 --project="./GPU_ODE_Julia/" ./GPU_ODE_Julia/sde_examples/bench_crn_model.jl $a 22 | # increment the value 23 | a=$((a*2)) 24 | done 25 | -------------------------------------------------------------------------------- /runner_scripts/gpu/run_ode_cpp.sh: -------------------------------------------------------------------------------- 1 | a=8 2 | # max_a=$((2**24)) 3 | max_a=$1 4 | while [ $a -le $max_a ] 5 | do 6 | echo $a 7 | sed -i "15d" ./GPU_ODE_MPGOS/Lorenz.cu 8 | sed -i "15 i #define SOLVER RK4" ./GPU_ODE_MPGOS/Lorenz.cu 9 | sed -i "17d" ./GPU_ODE_MPGOS/Lorenz.cu 10 | sed -i "17 i const int NT = $a;" ./GPU_ODE_MPGOS/Lorenz.cu 11 | 12 | make clean --directory=./GPU_ODE_MPGOS/ 13 | make --directory=./GPU_ODE_MPGOS/ 14 | ./GPU_ODE_MPGOS/Lorenz.exe $a 15 | 16 | sed -i "15d" ./GPU_ODE_MPGOS/Lorenz.cu 17 | sed -i "15 i #define SOLVER RKCK45" ./GPU_ODE_MPGOS/Lorenz.cu 18 | 19 | make clean --directory=./GPU_ODE_MPGOS/ 20 | make --directory=./GPU_ODE_MPGOS/ 21 | ./GPU_ODE_MPGOS/Lorenz.exe $a 22 | # increment the value 23 | a=$((a*4)) 24 | done 25 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/Project.toml: -------------------------------------------------------------------------------- 1 | name = "GPU_ODE_Julia" 2 | uuid = "d770f587-beb8-456f-87bf-3eef33441b01" 3 | authors = ["Utkarsh "] 4 | version = "0.1.0" 5 | 6 | [deps] 7 | BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" 8 | Catalyst = "479239e8-5488-4da2-87a7-35f2df7eef83" 9 | DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" 10 | DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d" 11 | DiffEqGPU = "071ae1c0-96b5-11e9-1965-c90190d839ea" 12 | ModelingToolkit = 
"961ee093-0014-501f-94e3-6117800e7a78" 13 | OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" 14 | ReactionNetworkImporters = "b4db0fb7-de2a-5028-82bf-5021f5cfa881" 15 | SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" 16 | SimpleDiffEq = "05bca326-078c-5bf0-a5bf-ce7c7982d7fd" 17 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 18 | StochasticDiffEq = "789caeaf-c7a9-5a7d-9973-96adeb23e2a0" 19 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/linear.jl: -------------------------------------------------------------------------------- 1 | using Random 2 | Random.seed!(123) 3 | 4 | # 1D Linear ODE 5 | function f(u::AbstractArray{T}, p, t::T) where {T} 6 | return T(1.01) * u 7 | end 8 | function f_analytic(u₀, p, t) 9 | u₀ * exp(1.01 * t) 10 | end 11 | 12 | tspan = (0.0, 10.0) 13 | tspan = T.(tspan) 14 | u0 = @SVector rand(T, 100) 15 | prob = ODEProblem(ODEFunction(f, analytic = f_analytic), u0, tspan) 16 | 17 | ensembleProb = EnsembleProblem(prob) 18 | 19 | ### Lower level API #### 20 | 21 | ## Building problems here only 22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | ## Make them compatible with CUDA 34 | probs = cu(probs) 35 | dt = T(0.1) 36 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/multistate.jl: -------------------------------------------------------------------------------- 1 | using ReactionNetworkImporters, Catalyst 2 | 3 | prnbng = loadrxnetwork(BNGNetwork(), joinpath(dirname(@__DIR__), "Models/multistate.net")) 4 | 5 | rn = prnbng.rn 6 | obs = [eq.lhs for eq in observed(rn)] 7 | 8 | osys = convert(ODESystem, rn) 9 | 10 | tf = 20.0 11 | tspan = (0.0, tf) 12 | oprob = 
ODEProblem{false}(osys, T[], tspan, T[]) 13 | 14 | prob = make_gpu_compatible(oprob, Val(T)) 15 | 16 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | ### Lower level API #### 21 | 22 | ## Building problems here only 23 | I = 1:numberOfParameters 24 | if ensembleProb.safetycopy 25 | probs = map(I) do i 26 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 27 | end 28 | else 29 | probs = map(I) do i 30 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 31 | end 32 | end 33 | 34 | ## Make them compatible with CUDA 35 | probs = cu(probs) 36 | dt = T(0.001) 37 | -------------------------------------------------------------------------------- /GPU_ODE_JAX/requirements.txt: -------------------------------------------------------------------------------- 1 | asttokens==2.2.1 2 | backcall==0.2.0 3 | backports.functools-lru-cache==1.6.4 4 | comm==0.1.2 5 | debugpy==1.6.6 6 | decorator==5.1.1 7 | diffrax==0.3.1 8 | equinox==0.10.1 9 | executing==1.2.0 10 | importlib-metadata==6.1.0 11 | ipykernel==6.19.2 12 | ipython==8.11.0 13 | jax==0.4.6 14 | jaxlib==0.4.6+cuda11.cudnn82 15 | jaxtyping==0.2.14 16 | jedi==0.18.2 17 | jupyter_client==8.1.0 18 | jupyter_core==5.3.0 19 | matplotlib-inline==0.1.6 20 | nest-asyncio==1.5.6 21 | numpy==1.24.2 22 | opt-einsum==3.3.0 23 | packaging==23.0 24 | parso==0.8.3 25 | pexpect==4.8.0 26 | pickleshare==0.7.5 27 | pip==23.0.1 28 | platformdirs==3.1.1 29 | prompt-toolkit==3.0.38 30 | psutil==5.9.4 31 | ptyprocess==0.7.0 32 | pure-eval==0.2.2 33 | Pygments==2.14.0 34 | python-dateutil==2.8.2 35 | pyzmq==25.0.2 36 | scipy==1.10.1 37 | setuptools==67.6.0 38 | six==1.16.0 39 | stack-data==0.6.2 40 | tornado==6.2 41 | traitlets==5.9.0 42 | typeguard==3.0.1 43 | typing_extensions==4.5.0 44 | wcwidth==0.2.6 45 | wheel==0.40.0 46 | zipp==3.15.0 47 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/src/GPU_ODE_Julia.jl: 
-------------------------------------------------------------------------------- 1 | module GPU_ODE_Julia 2 | using ModelingToolkit, StaticArrays, SciMLBase 3 | using DiffEqGPU 4 | 5 | function make_gpu_compatible(prob::T, ::Val{T1}) where {T <: ODEProblem, T1} 6 | sys = modelingtoolkitize(prob) 7 | prob = ODEProblem{false}(sys) 8 | remake(prob; u0 = SArray{Tuple{length(prob.u0)}, T1}(prob.u0), 9 | tspan = T1.(prob.tspan), 10 | p = prob.p isa SciMLBase.NullParameters ? prob.p : 11 | SArray{Tuple{length(prob.p)}, T1}(prob.p)) 12 | end 13 | 14 | struct GPUODE{T <: DiffEqGPU.GPUODEAlgorithm} <: SciMLBase.AbstractODEAlgorithm 15 | trajectories::Int 16 | end 17 | 18 | ## Wrapping for compat with WorkPrecisionSet 19 | function SciMLBase.__solve(prob::SciMLBase.AbstractODEProblem, alg::GPUODE{T}, args...; 20 | kwargs...) where {T} 21 | eprob = EnsembleProblem(prob) 22 | sol = solve(eprob, T(), EnsembleGPUKernel(0.0), trajectories = alg.trajectories; 23 | kwargs...) 24 | return sol[1] 25 | end 26 | 27 | export make_gpu_compatible, GPUODE 28 | 29 | end # module GPU_ODE_Julia 30 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/lorenz.jl: -------------------------------------------------------------------------------- 1 | function lorenz(u::AbstractArray{T}, p, t) where {T} 2 | du1 = T(10.0) * (u[2] - u[1]) 3 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 4 | du3 = u[1] * u[2] - T(8 // 3) * u[3] 5 | return @SVector T[du1, du2, du3] 6 | end 7 | 8 | u0 = @SVector T[1.0f0; 0.0f0; 0.0f0] 9 | tspan = (T(0.0), T(1.0)) 10 | p = @SArray T[28.0] 11 | prob = ODEProblem(lorenz, u0, tspan, p) 12 | 13 | parameterList = range(T(0.0), stop = T(21.0), length = numberOfParameters) 14 | 15 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 16 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 17 | 18 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 19 | 20 | ## Building problems here 
only 21 | I = 1:numberOfParameters 22 | if ensembleProb.safetycopy 23 | probs = map(I) do i 24 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 25 | end 26 | else 27 | probs = map(I) do i 28 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 29 | end 30 | end 31 | 32 | ## Make them compatible with CUDA 33 | probs = cu(probs) 34 | dt = T(0.001) 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Utkarsh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/ProfileSpec.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "Bash version ${BASH_VERSION}..." 
3 | 4 | FileName=$2 5 | LogFileName=$FileName'.log' 6 | ProfFileName=$FileName'.nvprof' 7 | 8 | rm -f $LogFileName 9 | 10 | echo "--- SUMMARY ---" >> $LogFileName 11 | echo >> $LogFileName 12 | 13 | nvprof --unified-memory-profiling off --profile-api-trace none ./$1 2>>$LogFileName 14 | 15 | echo >> $LogFileName 16 | echo >> $LogFileName 17 | 18 | 19 | echo "--- SPECIFIC METRICS AND EVENTS ---" >> $LogFileName 20 | echo >> $LogFileName 21 | 22 | nvprof --unified-memory-profiling off --kernels :::1 --events elapsed_cycles_sm,active_cycles --metrics sm_efficiency,achieved_occupancy,eligible_warps_per_cycle,branch_efficiency,local_load_throughput,local_store_throughput,ipc,issued_ipc,flop_count_dp_add,flop_count_dp_mul,flop_count_dp_fma,inst_integer,inst_control,inst_compute_ld_st,inst_misc,flop_dp_efficiency,l1_shared_utilization,l2_utilization,dram_utilization,ldst_fu_utilization,alu_fu_utilization,stall_pipe_busy,stall_exec_dependency,stall_memory_dependency,stall_inst_fetch,stall_not_selected,stall_memory_throttle,stall_other ./$1 2>>$LogFileName 23 | 24 | echo >> $LogFileName 25 | echo >> $LogFileName -------------------------------------------------------------------------------- /GPU_ODE_Julia/sde_examples/bench_cpu.jl: -------------------------------------------------------------------------------- 1 | using DiffEqGPU, BenchmarkTools, StaticArrays, StochasticDiffEq 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 7 | 8 | # Defining the Problem 9 | # dX = pudt + qudW 10 | u₀ = SA[0.1, 0.1, 0.1] 11 | f(u, p, t) = p[1] * u 12 | g(u, p, t) = p[2] * u 13 | tspan = (0.00, 1.0) 14 | p = SA[1.5, 0.01] 15 | 16 | prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234) 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | @info "Solving the problem" 21 | 22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | data = @benchmark solve($ensembleProb, EM(), EnsembleThreads(), dt = Float64(1 // 2^8), 34 | adaptive = false, save_everystep = false, 35 | trajectories = numberOfParameters) 36 | 37 | if !isinteractive() 38 | open(joinpath(dirname(dirname(@__DIR__)), "data", "CPU/SDE/times_unadaptive.txt"), 39 | "a+") do io 40 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 41 | end 42 | end 43 | 44 | println("Parameter number: " * string(numberOfParameters)) 45 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 46 | println("Allocs: " * string(data.allocs)) 47 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/catalyst_models/multistate.net: -------------------------------------------------------------------------------- 1 | # Created by BioNetGen 2.7.0 2 | begin parameters 3 | 1 R0 5360 # Constant 4 | 2 L0 1160 # Constant 5 | 3 A0 5360 # Constant 6 | 4 kon 0.01 # Constant 7 | 5 koff 0.1 # Constant 8 | 6 kAon 0.01 # Constant 9 | 7 kAoff 0.1 # Constant 10 | 8 kAp 0.01 # Constant 11 | 9 kAdp 0.1 # Constant 12 | end parameters 13 | begin species 14 | 1 R(a,l) R0 15 | 2 L(r) L0 16 | 3 A(Y~U,r) A0 17 | 4 L(r!1).R(a,l!1) 0 18 | 5 A(Y~U,r!1).R(a!1,l) 0 19 | 6 A(Y~U,r!1).L(r!2).R(a!1,l!2) 0 20 | 7 A(Y~P,r!1).L(r!2).R(a!1,l!2) 0 21 | 8 A(Y~P,r!1).R(a!1,l) 0 22 | 9 A(Y~P,r) 
0 23 | end species 24 | begin reactions 25 | 1 1,2 4 kon #_R1 26 | 2 1,3 5 kAon #_R2 27 | 3 2,5 6 kon #_R1 28 | 4 4 1,2 koff #_reverse__R1 29 | 5 3,4 6 kAon #_R2 30 | 6 5 1,3 kAoff #_reverse__R2 31 | 7 6 2,5 koff #_reverse__R1 32 | 8 6 3,4 kAoff #_reverse__R2 33 | 9 6 7 kAp #_R3 34 | 10 7 2,8 koff #_reverse__R1 35 | 11 7 4,9 kAoff #_reverse__R2 36 | 12 7 6 kAdp #_R4 37 | 13 2,8 7 kon #_R1 38 | 14 1,9 8 kAon #_R2 39 | 15 4,9 7 kAon #_R2 40 | 16 8 1,9 kAoff #_reverse__R2 41 | 17 8 5 kAdp #_R4 42 | 18 9 3 kAdp #_R4 43 | end reactions 44 | begin groups 45 | 1 A_P 7,8,9 46 | 2 A_unbound_P 9 47 | 3 A_bound_P 7,8 48 | 4 RLA_P 7 49 | end groups 50 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_sde_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using LaTeXStrings 6 | using StatsPlots 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 15 | 16 | 17 | Julia_data = readdlm(joinpath(base_path, "SDE", "Julia_times_unadaptive.txt")) 18 | 19 | GPU_times = Julia_data[:, 2] .* 1e-3 20 | Ns = Julia_data[:, 1] 21 | 22 | CPU_data = readdlm(joinpath(base_path, "CPU", "SDE", "times_unadaptive.txt")) 23 | 24 | CPU_times = CPU_data[:, 2] .* 1e-3 25 | 26 | times["Fixed_CPU"] = mean(CPU_times) 27 | 28 | times["Fixed_GPU"] = mean(GPU_times) 29 | 30 | 31 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 32 | 33 | yticks = 10 .^ round.(range(1, -6, length = 15), digits = 2) 34 | 35 | 36 | 37 | plt = groupedbar( 38 | log10.(Ns), 39 | [GPU_times CPU_times], 40 | labels = ["GPU: Float32" "CPU: Float64"], 41 | yaxis = :log, 42 | yticks = yticks, 43 | ylabel = "Time (s)", 44 | xlabel = "Trajectories (" * L"$10^n$" * ")", 45 | legend = :topleft, 46 | title = "Performance Comparison of solving SDEs \n between CPU and GPU", 47 | dpi = 600, 48 | ) 49 | 50 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 51 | 52 | isdir(plots_dir) || mkdir(plots_dir) 53 | 54 | 55 | savefig(plt, joinpath(plots_dir, "CPU_SDE_$(Dates.value(Dates.now())).png")) 56 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/multisite2.jl: -------------------------------------------------------------------------------- 1 | using GPU_ODE_Julia 2 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq, ReactionNetworkImporters, 3 | Catalyst 4 | using CUDA 5 | 6 | @show ARGS 7 | #settings 8 | 9 | numberOfParameters = isinteractive() ? 
2 : parse(Int64, ARGS[1]) 10 | gpuID = 0 11 | 12 | device!(CuDevice(gpuID)) 13 | println("Running on " * string(CuDevice(gpuID))) 14 | 15 | prnbng = loadrxnetwork(BNGNetwork(), joinpath(@__DIR__, "Models/multisite2.net")) 16 | 17 | rn = prnbng.rn 18 | obs = [eq.lhs for eq in observed(rn)] 19 | 20 | osys = convert(ODESystem, rn) 21 | 22 | tf = 2.0 23 | tspan = (0.0, tf) 24 | oprob = ODEProblem{false}(osys, Float64[], tspan, Float64[]) 25 | 26 | T = Float64 27 | 28 | prob = make_gpu_compatible(oprob, Val(T)) 29 | 30 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 31 | 32 | ensembleProb = EnsembleProblem(prob) 33 | 34 | sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 0.001f0) 35 | 36 | ### Lower level API #### 37 | 38 | ## Building problems here only 39 | I = 1:numberOfParameters 40 | if ensembleProb.safetycopy 41 | probs = map(I) do i 42 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 43 | end 44 | else 45 | probs = map(I) do i 46 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 47 | end 48 | end 49 | 50 | ## Make them compatible with CUDA 51 | probs = cu(probs) 52 | 53 | @info "Solving the problem" 54 | sol = @time CUDA.@sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 55 | save_everystep = false, dt = T(0.001)) 56 | -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | asttokens==2.2.1 3 | backcall==0.2.0 4 | backports.functools-lru-cache==1.6.4 5 | certifi==2022.12.7 6 | charset-normalizer==3.0.1 7 | cmake==3.25.0 8 | comm==0.1.2 9 | debugpy==1.6.6 10 | decorator==5.1.1 11 | executing==1.2.0 12 | filelock==3.9.0 13 | functorch==1.13.0 14 | idna==3.4 15 | importlib-metadata==6.0.0 16 | ipykernel==6.19.2 17 | ipython==8.10.0 18 | jedi==0.18.2 19 | jupyter_client==8.0.3 20 | jupyter_core==5.2.0 21 | Mako==1.2.4 
22 | MarkupSafe==2.1.2 23 | matplotlib-inline==0.1.6 24 | mkl-fft==1.3.1 25 | mkl-random==1.2.2 26 | mkl-service==2.4.0 27 | mpmath==1.2.1 28 | nest-asyncio==1.5.6 29 | networkx==3.0rc1 30 | numpy==1.23.4 31 | nvidia-cublas-cu11==11.10.3.66 32 | nvidia-cuda-nvrtc-cu11==11.7.99 33 | nvidia-cuda-runtime-cu11==11.7.99 34 | nvidia-cudnn-cu11==8.5.0.96 35 | packaging==23.0 36 | parso==0.8.3 37 | pexpect==4.8.0 38 | pickleshare==0.7.5 39 | Pillow==9.4.0 40 | pip==22.2.2 41 | platformdirs==3.0.0 42 | prompt-toolkit==3.0.36 43 | psutil==5.9.4 44 | ptyprocess==0.7.0 45 | pure-eval==0.2.2 46 | pycuda==2022.2.2 47 | Pygments==2.14.0 48 | python-dateutil==2.8.2 49 | pytools==2022.1.14 50 | pytorch-triton==2.0.0+0d7e753227 51 | pyzmq==25.0.0 52 | requests==2.28.2 53 | scipy==1.10.1 54 | setuptools==67.4.0 55 | six==1.16.0 56 | stack-data==0.6.2 57 | sympy==1.11.1 58 | torch==2.0.0.dev20230202+cu116 59 | torchaudio==2.0.0.dev20230201+cu116 60 | torchdiffeq==0.2.3 61 | torchode==0.1.1.post1 62 | torchtyping==0.1.4 63 | torchvision==0.15.0.dev20230201+cu116 64 | tornado==6.2 65 | traitlets==5.9.0 66 | typeguard==2.13.3 67 | typing_extensions==4.4.0 68 | urllib3==1.26.14 69 | wcwidth==0.2.6 70 | wheel==0.38.4 71 | zipp==3.15.0 72 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_EventHandling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EVENTHANDLING_H 2 | #define SINGLESYSTEM_PERTHREAD_EVENTHANDLING_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_EventTimeStepControl(\ 7 | int tid, \ 8 | int& r_UpdateStep, \ 9 | int r_TerminateSimulation, \ 10 | Precision* r_ActualEventValue, \ 11 | Precision* r_NextEventValue, \ 12 | Precision* s_EventTolerance, \ 13 | int* s_EventDirection, \ 14 | Precision r_TimeStep, \ 15 | Precision& r_NewTimeStep, \ 16 | Precision MinimumTimeStep) 17 | { 18 | Precision 
EventTimeStep = r_TimeStep; 19 | int IsCorrected = 0; 20 | 21 | if ( ( r_UpdateStep == 1 ) && ( r_TerminateSimulation == 0 ) ) 22 | { 23 | for (int i=0; i s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 26 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 27 | { 28 | EventTimeStep = MPGOS::FMIN( EventTimeStep, -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * r_TimeStep ); 29 | IsCorrected = 1; 30 | } 31 | } 32 | } 33 | 34 | if ( IsCorrected == 1 ) 35 | { 36 | if ( EventTimeStep < MinimumTimeStep ) 37 | { 38 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (thread id: %d)\n", tid); 39 | } else 40 | { 41 | r_NewTimeStep = EventTimeStep; 42 | r_UpdateStep = 0; 43 | } 44 | } 45 | } 46 | 47 | #endif -------------------------------------------------------------------------------- /GPU_ODE_Julia/sde_examples/bench_gpu.jl: -------------------------------------------------------------------------------- 1 | using DiffEqGPU, DiffEqBase, StaticArrays, CUDA, BenchmarkTools 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 8192 : parse(Int64, ARGS[1]) 7 | 8 | # Defining the Problem 9 | # dX = pudt + qudW 10 | u₀ = SA[0.1f0, 0.1f0, 0.1f0] 11 | f(u, p, t) = p[1] * u 12 | g(u, p, t) = p[2] * u 13 | tspan = (0.0f0, 1.0f0) 14 | p = SA[1.5f0, 0.01f0] 15 | 16 | prob = SDEProblem(f, g, u₀, tspan, p; seed = 1234) 17 | 18 | ensembleProb = EnsembleProblem(prob) 19 | 20 | ## Building problem for each trajectories. Since we just want to generate different 21 | ## time-series, the problem remains same. 
22 | I = 1:numberOfParameters 23 | if ensembleProb.safetycopy 24 | probs = map(I) do i 25 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 26 | end 27 | else 28 | probs = map(I) do i 29 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 30 | end 31 | end 32 | 33 | ## Move the arrays to the GPU 34 | probs = cu(probs); 35 | 36 | ## Finally use the lower API for faster solves! (Fixed time-stepping) 37 | 38 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $prob, GPUEM(); 39 | save_everystep = false, 40 | dt = Float32(1 // 2^8)) 41 | 42 | if !isinteractive() 43 | open(joinpath(dirname(dirname(@__DIR__)), "data", "SDE", "Julia_times_unadaptive.txt"), 44 | "a+") do io 45 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 46 | end 47 | end 48 | 49 | println("Parameter number: " * string(numberOfParameters)) 50 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 51 | println("Allocs: " * string(data.allocs)) 52 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/pleiades.jl: -------------------------------------------------------------------------------- 1 | function f!(du::AbstractArray{T}, u::AbstractArray{T}, p, t::T) where {T} 2 | @inbounds begin 3 | x = view(u, 1:7) # x 4 | y = view(u, 8:14) # y 5 | v = view(u, 15:21) # x′ 6 | w = view(u, 22:28) # y′ 7 | du[1:7] .= v 8 | du[8:14] .= w 9 | for i in 15:28 10 | du[i] = zero(u[1]) 11 | end 12 | for i in 1:7, j in 1:7 13 | if i != j 14 | r = ((x[i] - x[j])^(2.0f0) + (y[i] - y[j])^(2.0f0))^(3.0f0 / 2.0f0) 15 | du[14 + i] += j * (x[j] - x[i]) / r 16 | du[21 + i] += j * (y[j] - y[i]) / r 17 | end 18 | end 19 | end 20 | du = T.(du) 21 | end 22 | 23 | u0 = T[3.0, 3.0, -1.0, -3.0, 2.0, -2.0, 2.0, 3.0, -3.0, 2.0, 0, 0, -4.0, 4.0, 0, 0, 0, 0, 0, 24 | 1.75, -1.5, 0, 0, 0, -1.25, 1, 0, 0] 25 | tspan = (0.0, 3.0) 26 | oprob = ODEProblem(f!, u0, T.(tspan)) 27 | 28 | prob = make_gpu_compatible(oprob, Val(T)) 29 | 
30 | @assert prob.f(prob.u0, prob.p, T(1.0)) isa StaticArray{<:Tuple, T} 31 | 32 | ensembleProb = EnsembleProblem(prob) 33 | dt = T(0.001) 34 | 35 | # sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 36 | 37 | # ### Lower level API #### 38 | 39 | ## Building problems here only 40 | I = 1:numberOfParameters 41 | if ensembleProb.safetycopy 42 | probs = map(I) do i 43 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 44 | end 45 | else 46 | probs = map(I) do i 47 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 48 | end 49 | end 50 | 51 | ## Make them compatible with CUDA 52 | probs = cu(probs) 53 | 54 | # @info "Solving the problem" 55 | # sol = @time @CUDA.sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 56 | # save_everystep = false, dt = 0.001f0) 57 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_mult_gpu.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using StatsPlots 6 | using LaTeXStrings 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir, "devices") 15 | 16 | CUDA_data = readdlm(joinpath(base_path, "CUDA", "Julia_times_unadaptive.txt")) 17 | 18 | CUDA_times = CUDA_data[:, 2] .* 1e-3 19 | Ns = CUDA_data[:, 1] 20 | 21 | oneAPI_data = readdlm(joinpath(base_path, "oneAPI", "Julia_times_unadaptive.txt")) 22 | 23 | oneAPI_times = oneAPI_data[:, 2] .* 1e-3 24 | 25 | AMDGPU_data = readdlm(joinpath(base_path, "AMDGPU", "Julia_times_unadaptive.txt")) 26 | 27 | AMDGPU_times = AMDGPU_data[:, 2] .* 1e-3 28 | 29 | Metal_data = readdlm(joinpath(base_path, "Metal", "Julia_times_unadaptive.txt")) 30 | 31 | Metal_times = Metal_data[:, 2] .* 1e-3 32 | 33 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 34 | 35 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 36 | 37 | s = "Trajectories (" * L"$10^n$" * ")" 38 | 39 | colors = collect(palette(:default)) 40 | 41 | plt = groupedbar( 42 | log10.(Ns), 43 | [CUDA_times oneAPI_times AMDGPU_times Metal_times], 44 | labels = ["CUDA" "oneAPI" "AMDGPU" "Metal"], 45 | yaxis = :log, 46 | yticks = yticks, 47 | ylabel = "Time (s)", 48 | xlabel = s, 49 | legend = :topleft, 50 | title = "Performance Comparison with different GPU backends", 51 | titlefontsize = 12, 52 | palette = [colors[3], colors[1], colors[2], colors[4]], 53 | dpi = 300, 54 | ) 55 | 56 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 57 | 58 | isdir(plots_dir) || mkdir(plots_dir) 59 | 60 | savefig(plt, joinpath(plots_dir, "Multi_GPU_unadaptive_$(Dates.value(Dates.now())).png")) 61 | -------------------------------------------------------------------------------- /run_benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | has_n_option=false 3 | while getopts l:d:m:n: flag 4 | do 5 | case "${flag}" in 6 | l) lang=${OPTARG};; 7 | d) dev=${OPTARG};; 8 | m) model=${OPTARG};; 9 | 
n) nmax=${OPTARG};has_n_option=true;; 10 | \?) echo "Unknown option -$OPTARG"; exit 1;; 11 | esac 12 | done 13 | if $has_n_option; then 14 | nmax=$nmax 15 | else 16 | nmax=$((2**24)) 17 | fi 18 | echo $lang 19 | if [ $lang == "julia" ]; then 20 | echo "Benchmarking ${lang^} ${dev^^} accelerated ensemble ${model^^} solvers..." 21 | if [ $dev == "cpu" ];then 22 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 23 | eval "$cmd" 24 | elif [ $model == "sde" ];then 25 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 26 | eval "$cmd" 27 | else 28 | if [ -d "./data/${lang^}" ]; 29 | then 30 | rm -f "./data/${lang^}"/* 31 | mkdir -p "./data/${lang^}" 32 | else 33 | mkdir -p "./data/${lang^}" 34 | fi 35 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 36 | eval "$cmd" 37 | fi 38 | elif [[ $lang == "jax" || $lang == "pytorch" || $lang == "cpp" ]]; then 39 | if [[ $model != "ode" || $dev != "gpu" ]]; then 40 | echo "The benchmarking of ensemble ${model^^} solvers on ${dev^^} with ${lang} is not supported. Please use -m flag with \"ode\" and -d with \"gpu\"." 41 | exit 1 42 | else 43 | echo "Benchmarking ${lang^^} ${dev^^} accelerated ensemble ${model^^} solvers..." 44 | if [ -d "./data/${lang^^}" ] 45 | then 46 | rm -rf "./data/${lang^^}"/* 47 | mkdir -p "./data/${lang^^}" 48 | else 49 | mkdir -p "./data/${lang^^}" 50 | fi 51 | cmd="./runner_scripts/${dev}/run_${model}_${lang}.sh ${nmax}" 52 | eval "$cmd" 53 | fi 54 | fi -------------------------------------------------------------------------------- /runner_scripts/plot/plot_sde_crn.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | using LaTeXStrings 6 | using StatsPlots 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 12 | joinpath("paper_artifacts", "data", "Julia") 13 | 14 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 15 | 16 | Julia_data = readdlm(joinpath(base_path, "SDE", "CRN", "Julia_times_unadaptive.txt")) 17 | 18 | GPU_times = Julia_data[:, 2] .* 1e-3 19 | Ns = Julia_data[:, 1] 20 | 21 | CPU_data = readdlm(joinpath(base_path, "CPU", "SDE", "CRN", "Julia_times_unadaptive.txt")) 22 | 23 | CPU_times = CPU_data[:, 2] .* 1e-3 24 | 25 | times["Fixed_CPU"] = mean(CPU_times) 26 | 27 | times["Fixed_GPU"] = mean(GPU_times) 28 | 29 | 30 | xticks = 10 .^ round.(range(1, 7, length = 10), digits = 2) 31 | 32 | yticks = 10 .^ round.(range(2, -3, length = 11), digits = 2) 33 | 34 | # plt = plot( 35 | # Ns, 36 | # GPU_times, 37 | # xaxis = :log, 38 | # yaxis = :log, 39 | # linewidth = 2, 40 | # label = "GPU: Float32", 41 | # ylabel = "Time (s)", 42 | # xlabel = "Trajectories", 43 | # title = "Lorenz Problem: 1000 fixed time-steps", 44 | # legend = :topleft, 45 | # xticks = xticks, 46 | # yticks = yticks, 47 | # marker = :circle, 48 | # ) 49 | 50 | 51 | 52 | plt = groupedbar( 53 | log10.(Ns), 54 | [GPU_times CPU_times], 55 | labels = ["GPU" "CPU"], 56 | yaxis = :log, 57 | yticks = yticks, 58 | ylabel = "Time (s)", 59 | xlabel = "Trajectories (" * L"$10^n$" * ")", 60 | legend = :topleft, 61 | title = "Performance Comparison of parallel-parameter \n sweeps in SDEs between CPU and GPU", 62 | dpi = 600, 63 | ) 64 | 65 | 66 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 67 | 68 | isdir(plots_dir) || mkdir(plots_dir) 69 | 70 | savefig(plt, joinpath(plots_dir, "CPU_SDE_CRN_$(Dates.value(Dates.now())).png")) 71 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_cpu.jl: -------------------------------------------------------------------------------- 1 | using BenchmarkTools, StaticArrays, OrdinaryDiffEq 2 | 3 | @show ARGS 4 | #settings 5 | 6 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 7 | 8 | function lorenz(u, p, t) 9 | du1 = 10.0 * (u[2] - u[1]) 10 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 11 | du3 = u[1] * u[2] - 2.666 * u[3] 12 | return @SVector [du1, du2, du3] 13 | end 14 | 15 | u0 = @SVector [1.0; 0.0; 0.0] 16 | tspan = (0.0, 1.0) 17 | p = @SArray [21.0] 18 | prob = ODEProblem(lorenz, u0, tspan, p) 19 | 20 | parameterList = range(0.0, stop = 21.0, length = numberOfParameters) 21 | 22 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 23 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 24 | 25 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 26 | 27 | @info "Solving the problem" 28 | data = @benchmark solve($ensembleProb, Tsit5(), EnsembleThreads(), dt = 0.001, 29 | adaptive = false, save_everystep = false, 30 | trajectories = numberOfParameters) 31 | 32 | if !isinteractive() 33 | open(joinpath(dirname(@__DIR__), "data", "CPU", "times_unadaptive.txt"), "a+") do io 34 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 35 | end 36 | end 37 | 38 | println("Parameter number: " * string(numberOfParameters)) 39 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 40 | println("Allocs: " * string(data.allocs)) 41 | 42 | data = @benchmark solve($ensembleProb, Tsit5(), EnsembleThreads(), dt = 0.001, 43 | adaptive = true, save_everystep = false, 44 | trajectories = numberOfParameters) 45 | 46 | if !isinteractive() 47 | open(joinpath(dirname(@__DIR__), "data", "CPU", "times_adaptive.txt"), "a+") do io 48 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 49 | end 50 | end 51 | 52 | println("Parameter number: " * string(numberOfParameters)) 53 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 54 | println("Allocs: " * string(data.allocs)) 55 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/Lorenz_SystemDefinition.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef T2_PERTHREAD_SYSTEMDEFINITION_H 2 | #define T2_PERTHREAD_SYSTEMDEFINITION_H 3 | 4 | // SYSTEM 5 | template 6 | __forceinline__ __device__ void PerThread_OdeFunction(\ 7 | int tid, int NT, \ 8 | Precision* F, Precision* X, Precision T, \ 9 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 10 | { 11 | F[0] = 10.0*( X[1]-X[0] ); 12 | F[1] = cPAR[0]*X[0] - X[1] - X[0]*X[2]; 13 | F[2] = X[0]*X[1] - 2.666 * X[2]; 14 | } 15 | 16 | // EVENTS 17 | template 18 | __forceinline__ __device__ void PerThread_EventFunction(\ 19 | int tid, int NT, Precision* EF, \ 20 | Precision T, Precision dT, Precision* TD, Precision* X, \ 21 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 22 | { 23 | 24 | } 25 | 26 | template 27 | __forceinline__ __device__ void PerThread_ActionAfterEventDetection(\ 28 | int tid, int NT, int IDX, int& UDT, \ 29 | Precision &T, Precision &dT, Precision* TD, Precision* X, \ 30 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 31 | { 32 | 33 | } 34 | 35 | // ACCESSORIES 36 | template 37 | __forceinline__ __device__ void PerThread_ActionAfterSuccessfulTimeStep(\ 38 | int tid, int NT, int& UDT, \ 39 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 40 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 41 | { 42 | 43 | } 44 | 45 | template 46 | __forceinline__ __device__ void PerThread_Initialization(\ 47 | int tid, int NT, int& DOIDX, \ 48 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 49 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 50 | { 51 | 52 | } 53 | 54 | template 55 | __forceinline__ __device__ void PerThread_Finalization(\ 56 | int tid, int NT, int& DOIDX, \ 57 | Precision& T, Precision& dT, Precision* TD, Precision* X, \ 58 | Precision* cPAR, Precision* sPAR, int* sPARi, Precision* ACC, int* ACCi) 59 | { 60 | 
61 | } 62 | 63 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_DenseOutput.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_DENSEOUTPUT_H 2 | #define SINGLESYSTEM_PERTHREAD_DENSEOUTPUT_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_StoreDenseOutput(\ 7 | int tid, \ 8 | int r_UpdateDenseOutput, \ 9 | int& r_DenseOutputIndex, \ 10 | Precision* d_DenseOutputTimeInstances, \ 11 | Precision r_ActualTime, \ 12 | Precision* d_DenseOutputStates, \ 13 | Precision* r_ActualState, \ 14 | int& r_NumberOfSkippedStores, \ 15 | Precision& r_DenseOutputActualTime, \ 16 | Precision DenseOutputMinimumTimeStep, \ 17 | Precision UpperTimeDomain) 18 | { 19 | if ( r_UpdateDenseOutput == 1 ) 20 | { 21 | d_DenseOutputTimeInstances[tid + r_DenseOutputIndex*NT] = r_ActualTime; 22 | 23 | int DenseOutputStateIndex = tid + r_DenseOutputIndex*NT*SD; 24 | for (int i=0; i 41 | __forceinline__ __device__ void PerThread_DenseOutputStorageCondition(\ 42 | Precision r_ActualTime, \ 43 | Precision r_DenseOutputActualTime, \ 44 | int r_DenseOutputIndex, \ 45 | int r_NumberOfSkippedStores, \ 46 | int r_EndTimeDomainReached, \ 47 | int r_UserDefinedTermination, \ 48 | int& r_UpdateDenseOutput, \ 49 | Struct_SolverOptions SolverOptions) 50 | { 51 | if ( ( r_DenseOutputIndex < NDO ) && ( r_DenseOutputActualTime < r_ActualTime ) && ( r_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 52 | r_UpdateDenseOutput = 1; 53 | else 54 | r_UpdateDenseOutput = 0; 55 | 56 | if ( ( r_DenseOutputIndex < NDO ) && ( ( r_EndTimeDomainReached == 1 ) || ( r_UserDefinedTermination == 1 ) ) ) 57 | r_UpdateDenseOutput = 1; 58 | } 59 | 60 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/MPGOS_Overloaded_MathFunction.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef MPGOS_OVERLOADED_MATHFUNCTIONS_H 2 | #define MPGOS_OVERLOADED_MATHFUNCTIONS_H 3 | 4 | namespace MPGOS 5 | { 6 | // Floating point absolute value 7 | __forceinline__ __device__ float FABS(float a) 8 | { 9 | return fabsf(a); 10 | } 11 | 12 | __forceinline__ __device__ double FABS(double a) 13 | { 14 | return fabs(a); 15 | } 16 | 17 | 18 | // Floating point maximum ------------------------------------------------- 19 | __forceinline__ __device__ float FMAX(float a, float b) 20 | { 21 | return fmaxf(a, b); 22 | } 23 | 24 | __forceinline__ __device__ double FMAX(double a, double b) 25 | { 26 | return fmax(a, b); 27 | } 28 | 29 | 30 | // Floating point minimum ------------------------------------------------- 31 | __forceinline__ __device__ float FMIN(float a, float b) 32 | { 33 | return fminf(a, b); 34 | } 35 | 36 | __forceinline__ __device__ double FMIN(double a, double b) 37 | { 38 | return fmin(a, b); 39 | } 40 | 41 | // Floating point atomic minimum ------------------------------------------ 42 | __forceinline__ __device__ float atomicFMIN(float* address, float val) 43 | { 44 | int ret = __float_as_int(*address); 45 | while ( val < __int_as_float(ret) ) 46 | { 47 | int old = ret; 48 | if ( ( ret = atomicCAS((int *)address, old, __float_as_int(val)) ) == old ) 49 | break; 50 | } 51 | return __int_as_float(ret); 52 | } 53 | 54 | __forceinline__ __device__ double atomicFMIN(double *address, double val) 55 | { 56 | unsigned long long ret = __double_as_longlong(*address); 57 | while ( val < __longlong_as_double(ret) ) 58 | { 59 | unsigned long long old = ret; 60 | if ( ( ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val)) ) == old ) 61 | break; 62 | } 63 | return __longlong_as_double(ret); 64 | } 65 | 66 | // Floating point atomic maximum ------------------------------------------ 67 | __forceinline__ __device__ float atomicFMAX(float *address, float 
val) 68 | { 69 | int ret = __float_as_int(*address); 70 | while ( val > __int_as_float(ret) ) 71 | { 72 | int old = ret; 73 | if ( (ret = atomicCAS((int *)address, old, __float_as_int(val)) ) == old ) 74 | break; 75 | } 76 | return __int_as_float(ret); 77 | } 78 | 79 | __forceinline__ __device__ double atomicFMAX(double *address, double val) 80 | { 81 | unsigned long long ret = __double_as_longlong(*address); 82 | while ( val > __longlong_as_double(ret) ) 83 | { 84 | unsigned long long old = ret; 85 | if ( (ret = atomicCAS((unsigned long long *)address, old, __double_as_longlong(val)) ) == old ) 86 | break; 87 | } 88 | return __longlong_as_double(ret); 89 | } 90 | } 91 | 92 | #endif -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/bench_torchdiffeq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # Benchmarking torchdiffeq ODE solvers for ensemble problems, via vmap. The Lorenz ODE is integrated by Tsit5. 
5 | 6 | # Created By: Utkarsh 7 | # Last Updated: 19 April 2023 8 | 9 | # %% 10 | 11 | import torch 12 | import sys 13 | import os 14 | import timeit 15 | import sys 16 | 17 | numberOfParameters = int(sys.argv[1]) 18 | 19 | # %% 20 | 21 | 22 | import torchdiffeq 23 | import math 24 | import torch.nn as nn 25 | import timeit 26 | from torchdiffeq import odeint 27 | 28 | 29 | # %% 30 | 31 | ## Checking if torch installation has cuda enabled 32 | print("CUDA enabled: ", torch.has_cuda) 33 | 34 | 35 | # %% 36 | # Defining the Lorenz ODE problem 37 | class LorenzODE(torch.nn.Module): 38 | 39 | def __init__(self, rho = torch.tensor(21.0)): 40 | super(LorenzODE, self).__init__() 41 | self.sigma = nn.Parameter(torch.as_tensor([10.0])) 42 | self.rho = nn.Parameter(rho) 43 | self.beta = nn.Parameter(torch.as_tensor([8/3])) 44 | 45 | def forward(self, t, u): 46 | x, y, z = u[0],u[1],u[2] 47 | du1 = self.sigma[0] * (y - x) 48 | du2 = x * (self.rho - z) - y 49 | du3 = x * y - self.beta[0] * z 50 | return torch.stack([du1, du2, du3]) 51 | 52 | 53 | # %% 54 | # Uncomment for smoke test 55 | 56 | # u0 = torch.tensor([1.0,0.0,0.0]).cuda() 57 | # t = torch.linspace(0, 1.0, 1001).cuda() 58 | # y = odeint(LorenzODE(), u0, t, method='rk4',options=dict(step_size=0.001)) 59 | 60 | 61 | # %% 62 | # Define the solve without gradient calculations 63 | # Note: I was't able to JIT compile the code with this application, torchdiffeq + vmap 64 | def solve(p): 65 | with torch.no_grad(): 66 | traj = odeint(LorenzODE(rho = p), u0, t, method='rk4', options=dict(step_size=0.001)) 67 | return traj 68 | 69 | # Define the initial conditions and timepoints to save 70 | u0 = torch.tensor([1.0,0.0,0.0]).cuda() 71 | t = torch.linspace(0, 1.0, 2).cuda() 72 | 73 | 74 | # %% 75 | # Generate parameter list 76 | parameters = torch.linspace(0.0,21.0,numberOfParameters).cuda() 77 | 78 | 79 | # %% 80 | 81 | import timeit 82 | res = timeit.repeat(lambda: torch.vmap(solve)(parameters), repeat = 10, number = 1) 83 | 
84 | 85 | # %% 86 | # Print the best result 87 | 88 | best_time = min(res)*1000 89 | print("{:} ODE solves with fixed time-stepping completed in {:.1f} ms".format(numberOfParameters, best_time)) 90 | 91 | 92 | # %% 93 | # Save the result 94 | 95 | file = open("./data/PYTORCH/Torch_times_unadaptive.txt","a+") 96 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 97 | file.close() 98 | 99 | 100 | # %% 101 | -------------------------------------------------------------------------------- /MPI/gpu_ode_mpi.jl: -------------------------------------------------------------------------------- 1 | """ 2 | Scaling GPU ODE solvers to multiple GPU cluster nodes with MPI. 3 | 4 | Created by: Utkarsh 5 | Last Modified: 20 April 2023 6 | """ 7 | 8 | using MPI 9 | using CUDA 10 | using DiffEqGPU, StaticArrays, CUDA, DiffEqBase 11 | using BenchmarkTools 12 | 13 | function split_count(N::Integer, n::Integer) 14 | q, r = divrem(N, n) 15 | return [i <= r ? q + 1 : q for i = 1:n] 16 | end 17 | 18 | 19 | MPI.Init() 20 | 21 | comm = MPI.COMM_WORLD 22 | rank = MPI.Comm_rank(comm) 23 | comm_size = MPI.Comm_size(comm) 24 | 25 | root = 0 26 | 27 | function lorenz(u, p, t) 28 | σ = p[1] 29 | ρ = p[2] 30 | β = p[3] 31 | du1 = σ * (u[2] - u[1]) 32 | du2 = u[1] * (ρ - u[3]) - u[2] 33 | du3 = u[1] * u[2] - β * u[3] 34 | return SVector{3}(du1, du2, du3) 35 | end 36 | 37 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 38 | tspan = (0.0f0, 10.0f0) 39 | p = @SVector [10.0f0, 28.0f0, 8 / 3.0f0] 40 | prob = ODEProblem{false}(lorenz, u0, tspan, p) 41 | 42 | function perform_ode_solve(prob, parameter) 43 | trajectories = length(parameter) 44 | probs = map(1:trajectories) do i 45 | remake(prob, p = @SVector [10.0f0, parameter[i], 8 / 3.0f0]) 46 | end 47 | 48 | ## Move the arrays to the GPU 49 | probs = cu(probs) 50 | 51 | ts, us = DiffEqGPU.vectorized_asolve( 52 | probs, 53 | prob, 54 | GPUTsit5(); 55 | saveat = [prob.tspan[2]], 56 | dt = 0.1f0, 57 | ) 58 | end 59 | 60 | if rank == root 61 | M, N = 1, 
2^30 62 | 63 | test = collect(LinRange(0.0f0, 21.0f0, N)) 64 | output = CuArray{typeof(u0)}(undef, (1, N)) 65 | 66 | N_counts = split_count(N, comm_size - 1) 67 | 68 | sizes = pushfirst!(N_counts, 0) 69 | size_ubuf = UBuffer(sizes, 1) 70 | 71 | counts = sizes 72 | 73 | test_vbuf = VBuffer(test, counts) # VBuffer for scatter 74 | output_vbuf = VBuffer(output, counts) # VBuffer for gather 75 | else 76 | # these variables can be set to `nothing` on non-root processes 77 | size_ubuf = UBuffer(nothing) 78 | output_vbuf = test_vbuf = VBuffer(nothing) 79 | end 80 | 81 | MPI.Barrier(comm) 82 | 83 | local_size = MPI.Scatter(size_ubuf, NTuple{1,Int}, root, comm) 84 | local_test = MPI.Scatterv!(test_vbuf, zeros(Float32, local_size), root, comm) 85 | 86 | if rank != root 87 | ts, us = perform_ode_solve(prob, local_test) 88 | else 89 | us = CuArray{typeof(u0)}(undef, (1, 0)) 90 | end 91 | 92 | MPI.Barrier(comm) 93 | 94 | @show MPI.Get_processor_name(), size(us) 95 | 96 | MPI.Gatherv!(us, output_vbuf, root, comm) 97 | 98 | MPI.Barrier(comm) 99 | 100 | if rank == root 101 | println() 102 | println("Final matrix") 103 | println("================") 104 | @show size(output) 105 | end 106 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_lorenz_gpu.jl: -------------------------------------------------------------------------------- 1 | using Pkg 2 | 3 | Pkg.instantiate() 4 | Pkg.precompile() 5 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq 6 | using CUDA 7 | 8 | @show ARGS 9 | #settings 10 | 11 | numberOfParameters = isinteractive() ? 
8192 : parse(Int64, ARGS[1]) 12 | 13 | function lorenz(u, p, t) 14 | du1 = 10.0f0 * (u[2] - u[1]) 15 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 16 | du3 = u[1] * u[2] - 2.666f0 * u[3] 17 | return @SVector [du1, du2, du3] 18 | end 19 | 20 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 21 | tspan = (0.0f0, 1.0f0) 22 | p = @SArray [21.0f0] 23 | prob = ODEProblem(lorenz, u0, tspan, p) 24 | 25 | parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters) 26 | 27 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 28 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 29 | 30 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 31 | 32 | ## Building problems here only 33 | I = 1:numberOfParameters 34 | if ensembleProb.safetycopy 35 | probs = map(I) do i 36 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 37 | end 38 | else 39 | probs = map(I) do i 40 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 41 | end 42 | end 43 | 44 | ## Make them compatible with CUDA 45 | probs = cu(probs) 46 | 47 | @info "Solving the problem" 48 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob, 49 | GPUTsit5(); 50 | save_everystep = false, 51 | dt = 0.001f0) 52 | 53 | if !isinteractive() 54 | open(joinpath(dirname(@__DIR__), "data", "Julia", "Julia_times_unadaptive.txt"), 55 | "a+") do io 56 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 57 | end 58 | end 59 | 60 | println("Parameter number: " * string(numberOfParameters)) 61 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 62 | println("Allocs: " * string(data.allocs)) 63 | 64 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob, 65 | GPUTsit5(); 66 | dt = 0.001f0, reltol = 1.0f-8, 67 | abstol = 1.0f-8) 68 | 69 | if !isinteractive() 70 | open(joinpath(dirname(@__DIR__), "data", "Julia", "Julia_times_adaptive.txt"), 71 | "a+") do io 72 | println(io, numberOfParameters, " ", 
minimum(data.times) / 1e6) 73 | end 74 | end 75 | 76 | println("Parameter number: " * string(numberOfParameters)) 77 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 78 | println("Allocs: " * string(data.allocs)) 79 | -------------------------------------------------------------------------------- /GPU_ODE_JAX/environment.yml: -------------------------------------------------------------------------------- 1 | name: venv_jax 2 | channels: 3 | - nvidia/label/cuda-11.8.0 4 | - conda-forge 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_gnu 10 | - asttokens=2.2.1=pyhd8ed1ab_0 11 | - backcall=0.2.0=pyh9f0ad1d_0 12 | - backports=1.0=pyhd8ed1ab_3 13 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 14 | - bzip2=1.0.8=h7f98852_4 15 | - ca-certificates=2022.12.7=ha878542_0 16 | - comm=0.1.2=pyhd8ed1ab_0 17 | - cuda-nvcc=11.8.89=0 18 | - cudatoolkit=11.8.0=h37601d7_11 19 | - cudnn=8.4.1.50=hed8a83a_0 20 | - debugpy=1.6.6=py39h227be39_0 21 | - decorator=5.1.1=pyhd8ed1ab_0 22 | - executing=1.2.0=pyhd8ed1ab_0 23 | - importlib-metadata=6.1.0=pyha770c72_0 24 | - importlib_metadata=6.1.0=hd8ed1ab_0 25 | - ipykernel=6.19.2=py39hb070fc8_0 26 | - ipython=8.11.0=pyh41d4057_0 27 | - jedi=0.18.2=pyhd8ed1ab_0 28 | - jupyter_client=8.1.0=pyhd8ed1ab_0 29 | - jupyter_core=5.3.0=py39hf3d152e_0 30 | - ld_impl_linux-64=2.40=h41732ed_0 31 | - libffi=3.4.2=h7f98852_5 32 | - libgcc-ng=12.2.0=h65d4601_19 33 | - libgomp=12.2.0=h65d4601_19 34 | - libnsl=2.0.0=h7f98852_0 35 | - libsodium=1.0.18=h36c2ea0_1 36 | - libsqlite=3.40.0=h753d276_0 37 | - libstdcxx-ng=12.2.0=h46fd767_19 38 | - libuuid=2.32.1=h7f98852_1000 39 | - libzlib=1.2.13=h166bdaf_4 40 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 41 | - ncurses=6.3=h27087fc_1 42 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 43 | - openssl=3.1.0=h0b41bf4_0 44 | - packaging=23.0=pyhd8ed1ab_0 45 | - parso=0.8.3=pyhd8ed1ab_0 46 | - pexpect=4.8.0=pyh1a96a4e_2 47 | - 
pickleshare=0.7.5=py_1003 48 | - pip=23.0.1=pyhd8ed1ab_0 49 | - platformdirs=3.1.1=pyhd8ed1ab_0 50 | - prompt-toolkit=3.0.38=pyha770c72_0 51 | - prompt_toolkit=3.0.38=hd8ed1ab_0 52 | - psutil=5.9.4=py39hb9d737c_0 53 | - ptyprocess=0.7.0=pyhd3deb0d_0 54 | - pure_eval=0.2.2=pyhd8ed1ab_0 55 | - pygments=2.14.0=pyhd8ed1ab_0 56 | - python=3.9.16=h2782a2a_0_cpython 57 | - python-dateutil=2.8.2=pyhd8ed1ab_0 58 | - python_abi=3.9=3_cp39 59 | - pyzmq=25.0.2=py39h0be026e_0 60 | - readline=8.1.2=h0f457ee_0 61 | - setuptools=67.6.0=pyhd8ed1ab_0 62 | - six=1.16.0=pyh6c4a22f_0 63 | - stack_data=0.6.2=pyhd8ed1ab_0 64 | - tk=8.6.12=h27826a3_0 65 | - tornado=6.2=py39hb9d737c_1 66 | - traitlets=5.9.0=pyhd8ed1ab_0 67 | - typing-extensions=4.5.0=hd8ed1ab_0 68 | - typing_extensions=4.5.0=pyha770c72_0 69 | - tzdata=2022g=h191b570_0 70 | - wcwidth=0.2.6=pyhd8ed1ab_0 71 | - wheel=0.40.0=pyhd8ed1ab_0 72 | - xz=5.2.6=h166bdaf_0 73 | - zeromq=4.3.4=h9c3ff4c_1 74 | - zipp=3.15.0=pyhd8ed1ab_0 75 | - pip: 76 | - diffrax==0.3.1 77 | - equinox==0.10.1 78 | - jax==0.4.6 79 | - jaxlib==0.4.6+cuda11.cudnn82 80 | - jaxtyping==0.2.14 81 | - numpy==1.24.2 82 | - opt-einsum==3.3.0 83 | - scipy==1.10.1 84 | - typeguard==3.0.1 -------------------------------------------------------------------------------- /runner_scripts/plot/plot_ode_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | 6 | using Plots.PlotMeasures 7 | 8 | 9 | parent_dir = 10 | length(ARGS) != 0 ? 
joinpath(ARGS[1], "data") : 11 | joinpath("paper_artifacts", "data", "RTX_5000") 12 | 13 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir) 14 | 15 | times_v100 = Dict() 16 | 17 | Julia_data = readdlm(joinpath(base_path, "Julia", "Julia_times_unadaptive.txt")) 18 | 19 | Julia_times = Julia_data[:, 2] .* 1e-3 20 | Ns = Julia_data[:, 1] 21 | 22 | MPGOS_data = readdlm(joinpath(base_path, "CPP", "MPGOS_times_unadaptive.txt")) #= MPGOS (C++) results are stored under the CPP/ subdirectory in the shipped artifacts — TODO confirm runner output dir matches =# 23 | 24 | MPGOS_times = MPGOS_data[:, 2] .* 1e-3 25 | 26 | JAX_data = readdlm(joinpath(base_path, "JAX", "Jax_times_unadaptive.txt")) 27 | 28 | JAX_times = JAX_data[:, 2] .* 1e-3 29 | 30 | Torch_data = readdlm(joinpath(base_path, "PyTorch", "Torch_times_unadaptive.txt")) 31 | 32 | Torch_times = Torch_data[:, 2] .* 1e-3 33 | 34 | times_v100["Fixed_Julia"] = 35 | (minimum(Julia_times ./ Julia_times), maximum(Julia_times ./ Julia_times)) 36 | 37 | times_v100["Fixed_JAX"] = 38 | (minimum(JAX_times ./ Julia_times), maximum(JAX_times ./ Julia_times)) 39 | 40 | times_v100["Fixed_MPGOS"] = 41 | (minimum(MPGOS_times ./ Julia_times), maximum(MPGOS_times ./ Julia_times)) 42 | 43 | times_v100["Fixed_Torch"] = 44 | (minimum(Torch_times ./ Julia_times), maximum(Torch_times ./ Julia_times)) 45 | 46 | xticks = 10 .^ round.(range(1, 7, length = 13), digits = 2) 47 | 48 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 49 | gr(size = (810, 540)) 50 | plt = plot( 51 | Ns, 52 | Julia_times, 53 | xaxis = :log, 54 | yaxis = :log, 55 | linewidth = 2, 56 | label = "Julia", 57 | ylabel = "Time (s)", 58 | xlabel = "Trajectories", 59 | title = "Lorenz Problem: 1000 fixed time-steps", 60 | legend = :topleft, 61 | xticks = xticks, 62 | yticks = yticks, 63 | color = :Green, 64 | marker = :circle, 65 | dpi = 600, 66 | # left_margin = mm, bottom_margin = 4mm,top_margin = 6mm,right_margin = 6mm 67 | ) 68 | 69 | plt = plot!( 70 | Ns, 71 | MPGOS_times, 72 | xaxis = :log, 73 | yaxis = :log, 74 | linewidth = 2, 75 | label = "MPGOS", 76 | color = :Orange, 77 
| marker = :circle, 78 | ) 79 | 80 | plt = plot!( 81 | Ns, 82 | JAX_times, 83 | xaxis = :log, 84 | yaxis = :log, 85 | linewidth = 2, 86 | label = "JAX", 87 | color = :Red, 88 | marker = :circle, 89 | ) 90 | 91 | plt = plot!( 92 | Ns, 93 | Torch_times, 94 | xaxis = :log, 95 | yaxis = :log, 96 | linewidth = 2, 97 | label = "PyTorch", 98 | color = :DarkRed, 99 | marker = :circle, 100 | ) 101 | 102 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 103 | 104 | isdir(plots_dir) || mkdir(plots_dir) 105 | 106 | 107 | savefig(plt, joinpath(plots_dir, "Lorenz_unadaptive_$(Dates.value(Dates.now())).png")) 108 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bench_ensemblegpuarray.jl: -------------------------------------------------------------------------------- 1 | """ 2 | Benchmarking of the Julia's EnsembleGPUArray GPU acceleration. The implementation is similar 3 | to the vectorized map approach. The timings are stored in ./data folder, with Julia_EnGPUArray_times 4 | ".txt" file. 5 | 6 | Created by: Utkarsh 7 | Last Updated: 18 April 2023 8 | """ 9 | 10 | using DiffEqGPU, BenchmarkTools, StaticArrays, OrdinaryDiffEq 11 | using CUDA 12 | 13 | @show ARGS 14 | #settings 15 | 16 | numberOfParameters = isinteractive() ? 
8388608 : parse(Int64, ARGS[1]) 17 | 18 | function lorenz(u, p, t) 19 | du1 = 10.0f0 * (u[2] - u[1]) 20 | du2 = p[1] * u[1] - u[2] - u[1] * u[3] 21 | du3 = u[1] * u[2] - 2.666f0 * u[3] 22 | return @SVector [du1, du2, du3] 23 | end 24 | 25 | u0 = @SVector [1.0f0; 0.0f0; 0.0f0] 26 | tspan = (0.0f0, 1.0f0) 27 | p = @SArray [21.0f0] 28 | prob = ODEProblem(lorenz, u0, tspan, p) 29 | 30 | ## parameter list uniformly varying the single lorenz parameter 31 | parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters) 32 | 33 | lorenzProblem = ODEProblem(lorenz, u0, tspan, p) 34 | prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]]) 35 | 36 | ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func) 37 | 38 | batch = 1:numberOfParameters 39 | if ensembleProb.safetycopy 40 | probs = map(batch) do i 41 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 42 | end 43 | else 44 | probs = map(batch) do i 45 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 46 | end 47 | end 48 | 49 | @info "Solving the problem" 50 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_map_solve($probs, RK4(), 51 | EnsembleGPUArray(0.0), $batch, 52 | false, dt = 0.001f0, 53 | save_everystep = false, 54 | dense = false) 55 | 56 | if !isinteractive() 57 | open(joinpath(dirname(@__DIR__), "data", "EnsembleGPUArray", 58 | "Julia_EnGPUArray_times_unadaptive.txt"), "a+") do io 59 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 60 | end 61 | end 62 | 63 | println("Parameter number: " * string(numberOfParameters)) 64 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 65 | println("Allocs: " * string(data.allocs)) 66 | 67 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_map_solve($probs, Tsit5(), 68 | EnsembleGPUArray(0.0), $batch, 69 | true, dt = 0.001f0, 70 | save_everystep = false, 71 | dense = false) 72 | 73 | if !isinteractive() 74 | open(joinpath(dirname(@__DIR__), "data", "EnsembleGPUArray", 75 | 
"Julia_EnGPUArray_times_adaptive.txt"), "a+") do io 76 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 77 | end 78 | end 79 | 80 | println("Parameter number: " * string(numberOfParameters)) 81 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 82 | println("Allocs: " * string(data.allocs)) 83 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_ERRORCONTROLLERS_H 2 | #define SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_ERRORCONTROLLERS_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void PerThread_ErrorController_RK4(\ 7 | int tid, \ 8 | Precision InitialTimeStep, \ 9 | int& r_IsFinite, \ 10 | int& r_TerminateSimulation, \ 11 | Precision& r_NewTimeStep) 12 | { 13 | if ( r_IsFinite == 0 ) 14 | { 15 | printf("Error: State is not a finite number. Try to use smaller step size. 
(thread id: %d)\n", tid); 16 | r_TerminateSimulation = 1; 17 | } 18 | 19 | r_NewTimeStep = InitialTimeStep; 20 | } 21 | 22 | 23 | template 24 | __forceinline__ __device__ void PerThread_ErrorController_RKCK45(\ 25 | int tid, \ 26 | Precision r_TimeStep, \ 27 | Precision* r_ActualState, \ 28 | Precision* r_NextState, \ 29 | Precision* r_Error, \ 30 | Precision* s_RelativeTolerance, \ 31 | Precision* s_AbsoluteTolerance, \ 32 | int& r_UpdateStep, \ 33 | int& r_IsFinite, \ 34 | int& r_TerminateSimulation, \ 35 | Precision& r_NewTimeStep, \ 36 | Struct_SolverOptions SolverOptions) 37 | { 38 | Precision RelativeError = 1e30; 39 | Precision ErrorTolerance; 40 | Precision TimeStepMultiplicator; 41 | 42 | for (int i=0; i(0.9) * pow(RelativeError, static_cast(1.0/5.0) ); 52 | else 53 | TimeStepMultiplicator = static_cast(0.9) * pow(RelativeError, static_cast(1.0/4.0) ); 54 | 55 | if ( isfinite(TimeStepMultiplicator) == 0 ) 56 | r_IsFinite = 0; 57 | 58 | 59 | if ( r_IsFinite == 0 ) 60 | { 61 | TimeStepMultiplicator = SolverOptions.TimeStepShrinkLimit; 62 | r_UpdateStep = 0; 63 | 64 | if ( r_TimeStep < (SolverOptions.MinimumTimeStep*static_cast(1.01)) ) 65 | { 66 | printf("Error: State is not a finite number even with the minimal step size. Try to use less stringent tolerances. (thread id: %d)\n", tid); 67 | r_TerminateSimulation = 1; 68 | } 69 | } else 70 | { 71 | if ( r_TimeStep < (SolverOptions.MinimumTimeStep*static_cast(1.01)) ) 72 | { 73 | printf("Warning: Minimum step size reached! Continue with fixed minimum step size! 
Tolerance cannot be guaranteed!, thread id: %d, time step: %+6.5e, min step size: %+6.5e \n", tid, r_TimeStep, SolverOptions.MinimumTimeStep); 74 | r_UpdateStep = 1; 75 | } 76 | } 77 | 78 | 79 | TimeStepMultiplicator = MPGOS::FMIN(TimeStepMultiplicator, SolverOptions.TimeStepGrowLimit); 80 | TimeStepMultiplicator = MPGOS::FMAX(TimeStepMultiplicator, SolverOptions.TimeStepShrinkLimit); 81 | 82 | r_NewTimeStep = r_TimeStep * TimeStepMultiplicator; 83 | 84 | r_NewTimeStep = MPGOS::FMIN(r_NewTimeStep, SolverOptions.MaximumTimeStep); 85 | r_NewTimeStep = MPGOS::FMAX(r_NewTimeStep, SolverOptions.MinimumTimeStep); 86 | } 87 | 88 | #endif -------------------------------------------------------------------------------- /GPU_ODE_JAX/bench_diffrax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | # %% 4 | # Benchmarking Diffrax ODE solvers for ensemble problems, via vmap. The Lorenz ODE is integrated by Tsit5. 5 | 6 | # Created By: Utkarsh 7 | # Last Updated: 19 April 2023 8 | 9 | 10 | # %% 11 | import time 12 | 13 | import diffrax 14 | import equinox as eqx 15 | import jax 16 | import jax.numpy as jnp 17 | import numpy as np 18 | import os 19 | import timeit 20 | import sys 21 | 22 | numberOfParameters = int(sys.argv[1]) 23 | 24 | # %% 25 | 26 | 27 | from jax.lib import xla_bridge 28 | print("Working on :", xla_bridge.get_backend().platform) 29 | 30 | 31 | # %% 32 | # Defining the Lorenz Problem 33 | class Lorenz(eqx.Module): 34 | k1: float 35 | 36 | def __call__(self, t, y, args): 37 | f0 = 10.0*(y[1] - y[0]) 38 | f1 = self.k1 * y[0] - y[1] - y[0] * y[2] 39 | f2 = y[0] * y[1] - (8/3)*y[2] 40 | return jnp.stack([f0, f1, f2]) 41 | 42 | 43 | # %% 44 | # JIT compilation of ODE solver 45 | @jax.jit 46 | @jax.vmap 47 | def main(k1): 48 | lorenz = Lorenz(k1) 49 | terms = diffrax.ODETerm(lorenz) 50 | t0 = 0.0 51 | t1 = 1.0 52 | y0 = jnp.array([1.0, 0.0, 0.0]) 53 | dt0 = 0.001 54 | solver = 
diffrax.Tsit5() 55 | saveat = diffrax.SaveAt(ts = jnp.array([t0,t1])) 56 | stepsize_controller = diffrax.PIDController(rtol=1e-6, atol=1e-3) 57 | sol = diffrax.diffeqsolve( 58 | terms, 59 | solver, 60 | t0, 61 | t1, 62 | dt0, 63 | y0, 64 | ) 65 | return sol 66 | 67 | # %% 68 | # Setting up parameters for parallel simulation 69 | parameterList = jnp.linspace(0.0,21.0,numberOfParameters) 70 | 71 | # Test that vmap and JIT ordering does not make a noticeable difference: 72 | # https://colab.research.google.com/drive/1d7G-O5JX31lHbg7jTzzozbo5-Gp7DBEv?usp=sharing 73 | 74 | # %% 75 | # Use jax.vmap to compute parallel solutions of the ODE 76 | res = timeit.repeat(lambda: main(parameterList),repeat = 100,number = 1) 77 | 78 | best_time = min(res)*1000 79 | print("{:} ODE solves with fixed time-stepping completed in {:.1f} ms".format(numberOfParameters, best_time)) 80 | 81 | 82 | # %% 83 | # Save the minimum time 84 | file = open("./data/JAX/Jax_times_unadaptive.txt","a+") 85 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 86 | file.close() 87 | 88 | 89 | # %% 90 | # Repeat the same for adaptive time-stepping 91 | @jax.jit 92 | @jax.vmap 93 | def main(k1): 94 | lorenz = Lorenz(k1) 95 | terms = diffrax.ODETerm(lorenz) 96 | t0 = 0.0 97 | t1 = 1.0 98 | y0 = jnp.array([1.0, 0.0, 0.0]) 99 | dt0 = 0.001 100 | solver = diffrax.Tsit5() 101 | saveat = diffrax.SaveAt(ts = jnp.array([t0,t1])) 102 | stepsize_controller = diffrax.PIDController(rtol=1e-8, atol=1e-8) 103 | sol = diffrax.diffeqsolve( 104 | terms, 105 | solver, 106 | t0, 107 | t1, 108 | dt0, 109 | y0, 110 | # saveat=saveat, 111 | stepsize_controller=stepsize_controller, 112 | ) 113 | return sol 114 | 115 | 116 | # %% 117 | 118 | 119 | import timeit 120 | 121 | 122 | # %% 123 | 124 | 125 | res = timeit.repeat(lambda: main(parameterList),repeat = 100,number = 1) 126 | 127 | 128 | # %% 129 | 130 | best_time = min(res)*1000 131 | print("{:} ODE solves with adaptive time-stepping completed in {:.1f} 
ms".format(numberOfParameters, best_time)) 132 | 133 | 134 | # %% 135 | 136 | 137 | file = open("./data/JAX/Jax_times_adaptive.txt","a+") 138 | file.write('{0} {1}\n'.format(numberOfParameters, best_time)) 139 | file.close() 140 | 141 | -------------------------------------------------------------------------------- /runner_scripts/plot/plot_cpu_comp.jl: -------------------------------------------------------------------------------- 1 | using Plots 2 | using DelimitedFiles 3 | using Dates 4 | using Statistics 5 | 6 | gr(size = (720, 480)) 7 | 8 | times = Dict() 9 | 10 | parent_dir = 11 | length(ARGS) != 0 ? joinpath(ARGS[1], "data") : joinpath("paper_artifacts", "data") 12 | 13 | base_path = joinpath(dirname(dirname(@__DIR__)), parent_dir, "Julia") 14 | 15 | if length(ARGS) != 0 16 | Julia_data = readdlm(joinpath(base_path, "Julia_times_unadaptive.txt")) 17 | else 18 | Julia_data = readdlm( 19 | joinpath( 20 | dirname(dirname(@__DIR__)), 21 | parent_dir, 22 | "Tesla_V100", 23 | "Julia", 24 | "Julia_times_unadaptive.txt", 25 | ), 26 | ) 27 | end 28 | 29 | GPU_times = Julia_data[:, 2][1:9] .* 1e-3 30 | Ns = Julia_data[:, 1][1:9] 31 | 32 | Julia_EGArray_data = readdlm( 33 | joinpath(base_path, "EnsembleGPUArray", "Julia_EnGPUArray_times_unadaptive.txt"), 34 | ) 35 | 36 | GPU_EGArray_times = Julia_EGArray_data[:, 2][1:9] .* 1e-3 37 | 38 | CPU_data = readdlm(joinpath(base_path, "CPU", "times_unadaptive.txt")) 39 | 40 | CPU_times = CPU_data[:, 2] .* 1e-3 41 | 42 | times["Fixed_CPU"] = mean(CPU_times ./ GPU_times) 43 | 44 | times["Fixed_GPU"] = mean(GPU_times ./ GPU_times) 45 | 46 | times["Fixed_GPU_vmap"] = mean(GPU_EGArray_times ./ GPU_times) 47 | 48 | xticks = 10 .^ round.(range(1, 7, length = 13), digits = 2) 49 | 50 | yticks = 10 .^ round.(range(2, -5, length = 15), digits = 2) 51 | 52 | plt = plot( 53 | Ns, 54 | GPU_times, 55 | xaxis = :log, 56 | yaxis = :log, 57 | linewidth = 2, 58 | label = "EnsembleGPUKernel: Fixed dt", 59 | ylabel = "Time (s)", 60 | xlabel = 
"Trajectories", 61 | title = "Benchmarking the Lorenz Problem", 62 | legend = :topleft, 63 | xticks = xticks, 64 | yticks = yticks, 65 | marker = :circle, 66 | dpi = 600, 67 | color = :Green, 68 | ) 69 | 70 | 71 | plt = plot!( 72 | Ns, 73 | CPU_times, 74 | xaxis = :log, 75 | yaxis = :log, 76 | linewidth = 2, 77 | label = "CPU: Fixed dt", 78 | marker = :circle, 79 | color = :Orange, 80 | ) 81 | 82 | plt = plot!( 83 | Ns, 84 | GPU_EGArray_times, 85 | xaxis = :log, 86 | yaxis = :log, 87 | linewidth = 2, 88 | label = "EnsembleGPUArray: Fixed dt", 89 | marker = :circle, 90 | color = :Red, 91 | ) 92 | 93 | 94 | plots_dir = joinpath(dirname(dirname(@__DIR__)), "plots") 95 | 96 | isdir(plots_dir) || mkdir(plots_dir) 97 | 98 | 99 | if length(ARGS) != 0 100 | Julia_data = readdlm(joinpath(base_path, "Julia_times_adaptive.txt")) 101 | else 102 | Julia_data = readdlm( 103 | joinpath( 104 | dirname(dirname(@__DIR__)), 105 | parent_dir, 106 | "Tesla_V100", 107 | "Julia", 108 | "Julia_times_adaptive.txt", 109 | ), 110 | ) 111 | end 112 | 113 | GPU_times = Julia_data[:, 2][1:9] .* 1e-3 114 | Ns = Julia_data[:, 1][1:9] 115 | 116 | Julia_EGArray_data = 117 | readdlm(joinpath(base_path, "EnsembleGPUArray", "Julia_EnGPUArray_times_adaptive.txt")) 118 | 119 | GPU_EGArray_times = Julia_EGArray_data[:, 2][1:9] .* 1e-3 120 | 121 | CPU_data = readdlm(joinpath(base_path, "CPU", "times_adaptive.txt")) #= adaptive section must read the adaptive CPU timings, matching the Julia/EnsembleGPUArray adaptive files above =# 122 | 123 | CPU_times = CPU_data[:, 2] .* 1e-3 124 | 125 | times["Adaptive_CPU"] = mean(CPU_times ./ GPU_times) 126 | 127 | times["Adaptive_GPU"] = mean(GPU_times ./ GPU_times) 128 | 129 | times["Adaptive_GPU_vmap"] = mean(GPU_EGArray_times ./ GPU_times) 130 | 131 | 132 | plt = plot!( 133 | Ns, 134 | GPU_times, 135 | xaxis = :log, 136 | yaxis = :log, 137 | linewidth = 2, 138 | marker = :ltriangle, 139 | dpi = 600, 140 | color = :Green, 141 | label = "EnsembleGPUKernel: Adaptive dt", 142 | ) 143 | 144 | plt = plot!( 145 | Ns, 146 | CPU_times, 147 | xaxis = :log, 148 | yaxis = :log, 149 | 
using CUDA, DiffEqGPU, OrdinaryDiffEq, Plots, Serialization, StaticArrays, Distributions, LinearAlgebra
import DataInterpolations
const DI = DataInterpolations

trajectories = 100
u0 = @SVector [0.0f0, 0.0f0, 10000.0f0, 0f0, 0f0, 0f0]   # (x, y, z, vx, vy, vz)
tspan = (0.0f0, 50.0f0)
saveat = LinRange(tspan..., 100)
p = @SVector [25f0, 225f0, 9.807f0]                       # (CdS, mass, g)
CdS_dist = Normal(0f0, 1f0)                               # per-trajectory CdS perturbation

## Example where interpolation is performed on GPU

# Load the wind/density forecast once.
# (The original script repeated this load + table construction twice
# verbatim; the duplicate block has been removed.)
data = deserialize("forecast.txt")
N = length(data.altitude)

# One static 4-vector per altitude level: (altitude, windx, windy, density).
weather_sa = map(data.altitude, data.windx, data.windy, data.density) do alt, wx, wy, ρ
    SVector{4}(alt, wx, wy, ρ)
end

weather_sa = SVector{length(weather_sa)}(weather_sa)

# Linear interpolant over altitude; columns of the data matrix are levels.
interp = DI.LinearInterpolation{true}(hcat(weather_sa...), data.altitude)

# Query the interpolant at altitude `z`.
# Returns (wind, ρ): wind as (wx, wy, 0) and the air density ρ.
function get_weather(itp::DI.LinearInterpolation, z)
    weather = itp(z)
    wind = SVector{3}(weather[2], weather[3], 0f0)
    ρ = weather[4]
    wind, ρ
end

### solving the ODE on GPU + Interpolation using DataInterpolations

# Ballistic RHS: u = (position, velocity), p = ((CdS, mass, g), interpolant).
function ballistic_gpu(u, p, t)
    CdS, mass, g = p[1]
    interp = p[2]
    vel = @view u[4:6]

    wind, ρ = get_weather(interp, u[3])

    airvelocity = vel - wind
    airspeed = norm(airvelocity)
    # Drag acceleration plus gravity.
    # NOTE(review): the drag term is divided by mass but the gravity term is
    # multiplied by mass — dimensionally this looks like it should be just
    # `SVector{3}(0f0, 0f0, g)`. Kept as-is to preserve benchmark results;
    # confirm intent before changing.
    accel = -(ρ * CdS * airspeed) / (2 * mass) * airvelocity - mass * SVector{3}(0f0, 0f0, g)

    return SVector{6}(vel..., accel...)
end

prob_interp = ODEProblem{false}(ballistic_gpu, u0, tspan, (p, interp))

prob_func = (prob, i, repeat) -> remake(prob_interp,
    p = (p + SVector{3}(rand(CdS_dist), 0f0, 0f0), interp))
eprob_interp = EnsembleProblem(prob_interp, prob_func = prob_func, safetycopy = false)

esol_gpu = solve(eprob_interp, GPUTsit5(), EnsembleGPUKernel(CUDABackend(), 0.0);
    trajectories, saveat)

using BenchmarkTools

@benchmark esol_gpu = solve(eprob_interp, GPUTsit5(),
    EnsembleGPUKernel(CUDABackend(), 0.0); trajectories, saveat)

## Replace interpolation with textured-memory

# CuTextureArray wants plain tuples, not SVectors.
weather = map(weather_sa) do w
    (w...,)
end

weather_TA = CuTextureArray(weather)
texture = CuTexture(weather_TA; address_mode = CUDA.ADDRESS_MODE_CLAMP,
    normalized_coordinates = true, interpolation = CUDA.LinearInterpolation())

## Test Texture interpolation
idx = LinRange(0f0, 1f0, 4000)
idx_gpu = CuArray(idx)
idx_tlu = (1f0 - 1f0 / N) * idx_gpu .+ 0.5f0 / N # normalized table lookup form https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#table-lookup
dst_gpu = CuArray{NTuple{4, Float32}}(undef, size(idx))
dst_gpu .= getindex.(Ref(texture), idx_tlu) # interpolation ℝ->ℝ⁴

def_zmax = data.altitude[end]
N = length(data.altitude)

# Texture-memory variant of `get_weather`: altitude is mapped to the
# normalized table-lookup coordinate, then the hardware interpolator is
# sampled via indexing.
@inline function get_weather(tex, z, zmax, N)
    idx = (1f0 - 1f0 / N) * z / zmax + 0.5f0 / N # normalized input for table lookup based on https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#table-lookup
    weather = tex[idx]
    wind = SVector{3}(weather[2], weather[3], 0f0)
    ρ = weather[4]
    wind, ρ
end
### Experimentation

# Same ballistic right-hand side as `ballistic_gpu`, except the atmospheric
# state comes from the CUDA texture lookup instead of a DataInterpolations
# interpolant. p = ((CdS, mass, g), texture, zmax, N).
function ballistic_t(u, p, t)
    CdS, mass, g = p[1]
    tex = p[2]
    zmax = p[3]
    npts = p[4]

    velocity = @view u[4:6]
    wind, ρ = get_weather(tex, u[3], zmax, npts)

    v_air = velocity - wind
    speed = norm(v_air)
    drag = -(ρ * CdS * speed) / (2 * mass) * v_air
    gravity = mass * SVector{3}(0f0, 0f0, g)

    return SVector{6}(velocity..., (drag - gravity)...)
end

prob_tx = ODEProblem(ballistic_t, u0, tspan, (p, texture, def_zmax, N))

using Adapt

# Teach CUDA's kernel adaptor to move a CuArray of ODEProblems onto the
# device: adapt each contained problem first, then hand back the raw device
# array exactly as the default method would.
function Adapt.adapt_structure(to::CUDA.Adaptor, x::CuArray{<:ODEProblem})
    # first convert the contained ODE problems
    y = CuArray(adapt.(Ref(to), Array(x)))
    # continue doing what the default method does
    Base.unsafe_convert(CuDeviceArray{eltype(y), ndims(y), CUDA.AS.Global}, y)
end

prob_func = (prob, i, repeat) -> remake(prob,
    p = (p + SVector{3}(rand(CdS_dist), 0f0, 0f0), texture, def_zmax, N))
eprob_texture = EnsembleProblem(prob_tx, prob_func = prob_func, safetycopy = false)

esol_gpu = solve(eprob_texture, GPUTsit5(), EnsembleGPUKernel(CUDABackend(), 0.0);
    trajectories, saveat)

@benchmark esol_gpu = solve(eprob_texture, GPUTsit5(),
    EnsembleGPUKernel(CUDABackend(), 0.0); trajectories, saveat)
# Declare the model (using Catalyst).
σGen_system = @reaction_network begin
    (v0 + (S * σ)^n / ((S * σ)^n + (D * A3)^n + 1), 1.0), ∅ ↔ σ
    (σ / τ, 1 / τ), ∅ ↔ A1
    (A1 / τ, 1 / τ), ∅ ↔ A2
    (A2 / τ, 1 / τ), ∅ ↔ A3
end S D τ v0 n η

# Declares the parameter values.
σGen_parameters = [:S => 2.3, :D => 5.0, :τ => 10.0, :v0 => 0.1, :n => 3, :η => 0.1]

# Set ensemble parameter values.
S_grid = Float32.(10 .^ (range(-1.0, stop = 2, length = N)))
D_grid = Float32.(10 .^ (range(-1, stop = 2, length = N)))
τ_grid = Float32[0.1, 0.15, 0.20, 0.30, 0.50, 0.75, 1.0, 1.5, 2.0, 3.0, 5.0, 7.50, 10.0,
    15.0, 20.0, 30.0, 50.0, 75.0, 100.0][1:2:19]
v0_grid = Float32[0.01, 0.02, 0.03, 0.05, 0.075, 0.1, 0.15, 0.20]
n_grid = Float32[2.0, 3.0, 4.0]
η_grid = Float32[0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1]

# Cartesian product of all parameter grids; one entry per trajectory.
parameters = collect(Iterators.product(S_grid, D_grid, τ_grid, v0_grid, n_grid, η_grid));

numberOfParameters = length(parameters)

@show numberOfParameters

# prob_func for the parameter scan: trajectory `i` gets the i-th grid point
# as its (static) parameter vector.
#
# FIX(review): the original looped over *all* `parameters`, ignored `i`,
# discarded every `remake` result (a bare `for` loop returns `nothing`), and
# referenced `T1` before it was defined — so every trajectory would have run
# with the template problem's parameters. Now returns the remade problem for
# index `i`, mirroring the `prob_func` used in the benchmark below.
function σGen_p_func(prob, i, repeat)
    remake(prob; p = SVector{6, Float32}(parameters[i]...))
end

# Declare initial condition.
σGen_u0 = [:σ => 0.1, :A1 => 0.1, :A2 => 0.1, :A3 => 0.1] # (for some S values, the system will start far away from the steady state).
49 | σGen_sprob = SDEProblem(σGen_system, σGen_u0, (0.0, 1000.0), σGen_parameters, 50 | noise_scaling = (@parameters η)[1]) 51 | 52 | ### Experimentation 53 | sys = modelingtoolkitize(σGen_sprob) 54 | T1 = Float32 55 | prob = SDEProblem{false}(sys, SVector{length(σGen_sprob.u0), T1}(σGen_sprob.u0), 56 | Float32.(σGen_sprob.tspan), 57 | SVector{length(σGen_sprob.p), T1}(σGen_sprob.p), 58 | noise_rate_prototype = SMatrix{ 59 | size(σGen_sprob.noise_rate_prototype)..., 60 | T1}(σGen_sprob.noise_rate_prototype)) 61 | 62 | using DiffEqGPU 63 | 64 | # parameter as cartesian product of the ranges, initial condition as [v0,v0,v0,v0] 65 | function prob_func(prob, i, repeat) 66 | remake(prob; p = SVector{6, T1}(parameters[i]...), 67 | u0 = SVector{4, T1}(parameters[i][4], parameters[i][4], parameters[i][4], 68 | parameters[i][4])) 69 | end 70 | 71 | eprob = EnsembleProblem(prob, prob_func = prob_func, safetycopy = false) 72 | 73 | saveat = T1(0.0f0):T1(1.0f0):T1(1000.0f0) 74 | dt = T1(0.1f0) 75 | 76 | probs = map(1:numberOfParameters) do i 77 | prob_func(prob, i, false) 78 | end; 79 | 80 | ### Benchmarking 81 | using BenchmarkTools 82 | 83 | @info "Solving the problem: GPU" 84 | 85 | ## Move the arrays to the GPU 86 | gpuprobs = cu(probs); 87 | 88 | data = @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($gpuprobs, $prob, GPUEM(); 89 | save_everystep = false, 90 | dt = 0.1f0) 91 | 92 | if !isinteractive() 93 | open(joinpath(dirname(dirname(@__DIR__)), "data", "SDE", "CRN", 94 | "Julia_times_unadaptive.txt"), "a+") do io 95 | println(io, numberOfParameters, " ", minimum(data.times) / 1e6) 96 | end 97 | end 98 | println("Parameter number: " * string(numberOfParameters)) 99 | println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms") 100 | println("Allocs: " * string(data.allocs)) 101 | 102 | @info "Solving the problem: CPU" 103 | 104 | data = @benchmark solve($eprob, EM(), EnsembleThreads(), dt = 0.1f0, adaptive = false, 105 | save_everystep = false, trajectories = 
using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq

@show ARGS
#settings

# Ensemble size: ARGS[1] when run as a script, a small default interactively.
numberOfParameters = isinteractive() ? 8192 : parse(Int64, ARGS[1])

# Lorenz system with σ = 10 and β ≈ 8/3 hard-coded (Float32); ρ = p[1] is the
# scanned parameter.
function lorenz(u, p, t)
    du1 = 10.0f0 * (u[2] - u[1])
    du2 = p[1] * u[1] - u[2] - u[1] * u[3]
    du3 = u[1] * u[2] - 2.666f0 * u[3]
    return @SVector [du1, du2, du3]
end

u0 = @SVector [1.0f0; 0.0f0; 0.0f0]
tspan = (0.0f0, 1.0f0)
p = @SArray [21.0f0]
prob = ODEProblem(lorenz, u0, tspan, p)

parameterList = range(0.0f0, stop = 21.0f0, length = numberOfParameters)

lorenzProblem = ODEProblem(lorenz, u0, tspan, p)
prob_func = (prob, i, repeat) -> remake(prob, p = @SArray [parameterList[i]])

ensembleProb = EnsembleProblem(lorenzProblem, prob_func = prob_func)

## Building problems here only
I = 1:numberOfParameters
if ensembleProb.safetycopy
    probs = map(I) do i
        ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1)
    end
else
    probs = map(I) do i
        ensembleProb.prob_func(ensembleProb.prob, i, 1)
    end
end

## Make them compatible with Backend; ARGS[2] selects the vendor stack.
probs = if ARGS[2] == "CUDA"
    using CUDA
    cu(probs)
elseif ARGS[2] == "oneAPI"
    using oneAPI
    probs |> oneArray
elseif ARGS[2] == "AMDGPU"
    using AMDGPU
    roc(probs)
elseif ARGS[2] == "Metal"
    using Metal
    probs |> MtlArray
end

@info "Solving the problem"
# FIX(review): the AMDGPU branches previously had no device synchronization
# while the CUDA/oneAPI/Metal branches used `@sync`, so AMD timings could
# measure kernel launch rather than completion. `AMDGPU.@sync` added to both
# AMDGPU benchmarks for a fair comparison.
data = if ARGS[2] == "CUDA"
    @benchmark CUDA.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "oneAPI"
    @benchmark oneAPI.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "AMDGPU"
    @benchmark AMDGPU.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
elseif ARGS[2] == "Metal"
    @benchmark Metal.@sync DiffEqGPU.vectorized_solve($probs, $ensembleProb.prob,
        GPUTsit5();
        save_everystep = false,
        dt = 0.001f0)
end

if !isinteractive()
    open(joinpath(dirname(@__DIR__), "data", "devices", ARGS[2],
            "Julia_times_unadaptive.txt"), "a+") do io
        println(io, numberOfParameters, " ", minimum(data.times) / 1e6)
    end
end

println("Parameter number: " * string(numberOfParameters))
println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms")
println("Allocs: " * string(data.allocs))

data = if ARGS[2] == "CUDA"
    @benchmark CUDA.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "oneAPI"
    @benchmark oneAPI.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "AMDGPU"
    @benchmark AMDGPU.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
elseif ARGS[2] == "Metal"
    @benchmark Metal.@sync DiffEqGPU.vectorized_asolve($probs, $ensembleProb.prob,
        GPUTsit5();
        dt = 0.001f0, reltol = 1.0f-8,
        abstol = 1.0f-8)
end

if !isinteractive()
    open(joinpath(dirname(@__DIR__), "data", "devices", ARGS[2],
            "Julia_times_adaptive.txt"), "a+") do io
        println(io, numberOfParameters, " ", minimum(data.times) / 1e6)
    end
end

println("Parameter number: " * string(numberOfParameters))
println("Minimum time: " * string(minimum(data.times) / 1e6) * " ms")
println("Allocs: " * string(data.allocs))
5; 40 | int SelectedDevice = SelectDeviceByClosestRevision(MajorRevision, MinorRevision); 41 | 42 | PrintPropertiesOfSpecificDevice(SelectedDevice); 43 | 44 | 45 | int NumberOfParameters_R = NumberOfProblems; 46 | PRECISION R_RangeLower = 0.0; 47 | PRECISION R_RangeUpper = 21.0; 48 | vector Parameters_R_Values(NumberOfParameters_R,0); 49 | Linspace(Parameters_R_Values, R_RangeLower, R_RangeUpper, NumberOfParameters_R); 50 | 51 | 52 | ProblemSolver ScanLorenz(SelectedDevice); 53 | 54 | ScanLorenz.SolverOption(ThreadsPerBlock, BlockSize); 55 | ScanLorenz.SolverOption(InitialTimeStep, 1.0e-3); 56 | 57 | 58 | clock_t SimulationStart; 59 | clock_t SimulationEnd; 60 | 61 | FillSolverObject(ScanLorenz, Parameters_R_Values, NT); 62 | 63 | ScanLorenz.SynchroniseFromHostToDevice(All); 64 | 65 | SimulationStart = clock(); 66 | ScanLorenz.Solve(); 67 | ScanLorenz.InsertSynchronisationPoint(); 68 | ScanLorenz.SynchroniseSolver(); 69 | SimulationEnd = clock(); 70 | 71 | ScanLorenz.SynchroniseFromDeviceToHost(All); 72 | 73 | cout << "Total simulation time: " << 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC << "ms" << endl; 74 | cout << "Simulation time / 1000 RK4 step: " << 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC << "ms" << endl; 75 | cout << "Ensemble size: " << NT << endl << endl; 76 | 77 | 78 | ofstream datafile; 79 | if (SOLVER == RK4){ 80 | datafile.open ("./data/cpp/MPGOS_times_unadaptive.txt",ios::app); 81 | datafile << NT << "\t"<< 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC <<"\n"; 82 | datafile.close(); 83 | }else{ 84 | 85 | datafile.open ("./data/cpp/MPGOS_times_adaptive.txt",ios::app); 86 | datafile << NT << "\t"<< 1000.0*(SimulationEnd-SimulationStart) / CLOCKS_PER_SEC <<"\n"; 87 | datafile.close(); 88 | } 89 | 90 | //SaveData(ScanLorenz, NT); 91 | 92 | cout << "Test finished!" 
<< endl; 93 | } 94 | 95 | // AUXILIARY FUNCTION ----------------------------------------------------------------------------- 96 | 97 | void Linspace(vector& x, PRECISION B, PRECISION E, int N) 98 | { 99 | PRECISION Increment; 100 | 101 | x[0] = B; 102 | 103 | if ( N>1 ) 104 | { 105 | x[N-1] = E; 106 | Increment = (E-B)/(N-1); 107 | 108 | for (int i=1; i& Solver, const vector& R_Values, int NumberOfThreads) 116 | { 117 | int ProblemNumber = 0; 118 | for (int k=0; k& Solver, int NumberOfThreads) 134 | { 135 | ofstream DataFile; 136 | DataFile.open ( "Lorenz.txt" ); 137 | 138 | int Width = 18; 139 | DataFile.precision(10); 140 | DataFile.flags(ios::scientific); 141 | 142 | for (int tid=0; tid(tid, ControlParameters, 0) << ','; 145 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 0) << ','; 146 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 1) << ','; 147 | DataFile.width(Width); DataFile << Solver.GetHost(tid, ActualState, 2); 148 | DataFile << '\n'; 149 | } 150 | 151 | DataFile.close(); 152 | } 153 | -------------------------------------------------------------------------------- /GPU_ODE_PyTorch/environment.yml: -------------------------------------------------------------------------------- 1 | name: venv_torch 2 | channels: 3 | - nvidia/label/cuda-11.6.0 4 | - conda-forge 5 | - defaults 6 | - anaconda 7 | dependencies: 8 | - _libgcc_mutex=0.1=conda_forge 9 | - _openmp_mutex=4.5=2_kmp_llvm 10 | - appdirs=1.4.4=pyh9f0ad1d_0 11 | - asttokens=2.2.1=pyhd8ed1ab_0 12 | - backcall=0.2.0=pyh9f0ad1d_0 13 | - backports=1.0=pyhd8ed1ab_3 14 | - backports.functools_lru_cache=1.6.4=pyhd8ed1ab_0 15 | - blas=1.0=mkl 16 | - boost=1.78.0=py310hc4a4660_4 17 | - boost-cpp=1.78.0=h75c5d50_1 18 | - bzip2=1.0.8=h7f98852_4 19 | - ca-certificates=2022.12.7=ha878542_0 20 | - certifi=2022.12.7=pyhd8ed1ab_0 21 | - comm=0.1.2=pyhd8ed1ab_0 22 | - cuda-cccl=11.6.55=hf6102b2_0 23 | - cuda-command-line-tools=11.6.0=0 24 | - cuda-compiler=11.6.0=0 25 | - 
cuda-cudart=11.6.55=he381448_0 26 | - cuda-cudart-dev=11.6.55=h42ad0f4_0 27 | - cuda-cuobjdump=11.6.55=h9dd2d0c_0 28 | - cuda-cupti=11.6.55=h43c5c43_0 29 | - cuda-cuxxfilt=11.6.55=h69de05d_0 30 | - cuda-driver-dev=11.6.55=0 31 | - cuda-gdb=11.6.55=hff0b7d4_0 32 | - cuda-libraries=11.6.0=0 33 | - cuda-libraries-dev=11.6.0=0 34 | - cuda-memcheck=11.6.55=h0288dce_0 35 | - cuda-nsight=11.6.55=0 36 | - cuda-nsight-compute=11.6.0=0 37 | - cuda-nvcc=11.6.55=h5758ece_0 38 | - cuda-nvdisasm=11.6.55=h5556c0d_0 39 | - cuda-nvml-dev=11.6.55=haa9ef22_0 40 | - cuda-nvprof=11.6.55=h30b2dac_0 41 | - cuda-nvprune=11.6.55=h3791f62_0 42 | - cuda-nvrtc=11.6.55=hc54fff9_0 43 | - cuda-nvrtc-dev=11.6.55=h42ad0f4_0 44 | - cuda-nvtx=11.6.55=h99d0529_0 45 | - cuda-nvvp=11.6.58=h67ee751_0 46 | - cuda-samples=11.6.101=h8efea70_0 47 | - cuda-sanitizer-api=11.6.55=h4716e2e_0 48 | - cuda-toolkit=11.6.0=0 49 | - cuda-tools=11.6.0=0 50 | - cuda-visual-tools=11.6.0=0 51 | - cudatoolkit=11.8.0=h37601d7_11 52 | - debugpy=1.6.6=py310heca2aa9_0 53 | - decorator=5.1.1=pyhd8ed1ab_0 54 | - executing=1.2.0=pyhd8ed1ab_0 55 | - gds-tools=1.2.0.100=0 56 | - icu=70.1=h27087fc_0 57 | - importlib-metadata=6.0.0=pyha770c72_0 58 | - importlib_metadata=6.0.0=hd8ed1ab_0 59 | - ipykernel=6.19.2=py310h2f386ee_0 60 | - ipython=8.10.0=pyh41d4057_0 61 | - jedi=0.18.2=pyhd8ed1ab_0 62 | - jupyter_client=8.0.3=pyhd8ed1ab_0 63 | - jupyter_core=5.2.0=py310hff52083_0 64 | - ld_impl_linux-64=2.40=h41732ed_0 65 | - libcublas=11.8.1.74=h1e58c10_0 66 | - libcublas-dev=11.8.1.74=h7a51e1f_0 67 | - libcufft=10.7.0.55=h563f203_0 68 | - libcufft-dev=10.7.0.55=h05eb8d0_0 69 | - libcufile=1.2.0.100=0 70 | - libcufile-dev=1.2.0.100=0 71 | - libcurand=10.2.9.55=h7c349da_0 72 | - libcurand-dev=10.2.9.55=hd2e71f0_0 73 | - libcusolver=11.3.2.55=hebb49eb_0 74 | - libcusparse=11.7.1.55=h9a152cf_0 75 | - libffi=3.4.2=h7f98852_5 76 | - libgcc-ng=12.2.0=h65d4601_19 77 | - libhwloc=2.9.0=hd6dc26d_0 78 | - libiconv=1.17=h166bdaf_0 79 | - 
libnpp=11.6.0.55=hdb0c674_0 80 | - libnpp-dev=11.6.0.55=h0163868_0 81 | - libnsl=2.0.0=h7f98852_0 82 | - libnvjpeg=11.6.0.55=h6f17e28_0 83 | - libnvjpeg-dev=11.6.0.55=h0163868_0 84 | - libsodium=1.0.18=h36c2ea0_1 85 | - libsqlite=3.40.0=h753d276_0 86 | - libstdcxx-ng=12.2.0=h46fd767_19 87 | - libuuid=2.32.1=h7f98852_1000 88 | - libxml2=2.10.3=h7463322_0 89 | - libzlib=1.2.13=h166bdaf_4 90 | - llvm-openmp=15.0.7=h0cdce71_0 91 | - mako=1.2.4=pyhd8ed1ab_0 92 | - markupsafe=2.1.2=py310h1fa729e_0 93 | - matplotlib-inline=0.1.6=pyhd8ed1ab_0 94 | - mkl=2021.4.0=h8d4b97c_729 95 | - mkl-service=2.4.0=py310ha2c4b55_0 96 | - mkl_fft=1.3.1=py310h2b4bcf5_1 97 | - mkl_random=1.2.2=py310h00e6091_0 98 | - ncurses=6.3=h27087fc_1 99 | - nest-asyncio=1.5.6=pyhd8ed1ab_0 100 | - nsight-compute=2022.1.0.12=0 101 | - numpy=1.23.4=py310hd5efca6_0 102 | - numpy-base=1.23.4=py310h8e6c178_0 103 | - openssl=3.1.0=h0b41bf4_0 104 | - packaging=23.0=pyhd8ed1ab_0 105 | - parso=0.8.3=pyhd8ed1ab_0 106 | - pexpect=4.8.0=pyh1a96a4e_2 107 | - pickleshare=0.7.5=py_1003 108 | - pip=22.2.2=py310h06a4308_0 109 | - platformdirs=3.0.0=pyhd8ed1ab_0 110 | - prompt-toolkit=3.0.36=pyha770c72_0 111 | - psutil=5.9.4=py310h5764c6d_0 112 | - ptyprocess=0.7.0=pyhd3deb0d_0 113 | - pure_eval=0.2.2=pyhd8ed1ab_0 114 | - pycuda=2022.2.2=py310h8981878_0 115 | - pygments=2.14.0=pyhd8ed1ab_0 116 | - python=3.10.9=he550d4f_0_cpython 117 | - python-dateutil=2.8.2=pyhd8ed1ab_0 118 | - python_abi=3.10=3_cp310 119 | - pytools=2022.1.14=pyhd8ed1ab_0 120 | - pyzmq=25.0.0=py310h059b190_0 121 | - readline=8.1.2=h0f457ee_0 122 | - setuptools=67.4.0=pyhd8ed1ab_0 123 | - six=1.16.0=pyh6c4a22f_0 124 | - stack_data=0.6.2=pyhd8ed1ab_0 125 | - tbb=2021.8.0=hf52228f_0 126 | - tk=8.6.12=h27826a3_0 127 | - tornado=6.2=py310h5764c6d_1 128 | - traitlets=5.9.0=pyhd8ed1ab_0 129 | - typing-extensions=4.4.0=hd8ed1ab_0 130 | - typing_extensions=4.4.0=pyha770c72_0 131 | - tzdata=2022g=h191b570_0 132 | - wcwidth=0.2.6=pyhd8ed1ab_0 133 | - 
wheel=0.38.4=pyhd8ed1ab_0 134 | - xz=5.2.6=h166bdaf_0 135 | - zeromq=4.3.4=h9c3ff4c_1 136 | - zipp=3.15.0=pyhd8ed1ab_0 137 | - zstd=1.5.2=h3eb15da_6 138 | - pip: 139 | - charset-normalizer==3.0.1 140 | - cmake==3.25.0 141 | - filelock==3.9.0 142 | - functorch==1.13.0 143 | - idna==3.4 144 | - mpmath==1.2.1 145 | - networkx==3.0rc1 146 | - nvidia-cublas-cu11==11.10.3.66 147 | - nvidia-cuda-nvrtc-cu11==11.7.99 148 | - nvidia-cuda-runtime-cu11==11.7.99 149 | - nvidia-cudnn-cu11==8.5.0.96 150 | - pillow==9.4.0 151 | - pytorch-triton==2.0.0+0d7e753227 152 | - requests==2.28.2 153 | - scipy==1.10.1 154 | - sympy==1.11.1 155 | - torch==2.0.0.dev20230202+cu116 156 | - torchaudio==2.0.0.dev20230201+cu116 157 | - torchdiffeq==0.2.3 158 | - torchode==0.1.1.post1 159 | - torchtyping==0.1.4 160 | - torchvision==0.15.0.dev20230201+cu116 161 | - typeguard==2.13.3 162 | - urllib3==1.26.14 163 | -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/ode_problems/pollu.jl: -------------------------------------------------------------------------------- 1 | using GPU_ODE_Julia 2 | using DiffEqGPU, BenchmarkTools, StaticArrays, SimpleDiffEq 3 | using CUDA 4 | 5 | @show ARGS 6 | #settings 7 | 8 | numberOfParameters = isinteractive() ? 
# In-place right-hand side of the 20-species air-pollution ("POLLU") stiff
# IVP. `y` holds the species concentrations, `dy` receives the derivatives.
# The rate constants k1..k25 are module-level `const`s defined above; `p` and
# `t` are unused (the system is autonomous).
function f(dy::AbstractArray{T}, y::AbstractArray{T}, p, t) where {T}
    # Mass-action reaction rates r1..r25.
    r1 = k1 * y[1]
    r2 = k2 * y[2] * y[4]
    r3 = k3 * y[5] * y[2]
    r4 = k4 * y[7]
    r5 = k5 * y[7]
    r6 = k6 * y[7] * y[6]
    r7 = k7 * y[9]
    r8 = k8 * y[9] * y[6]
    r9 = k9 * y[11] * y[2]
    r10 = k10 * y[11] * y[1]
    r11 = k11 * y[13]
    r12 = k12 * y[10] * y[2]
    r13 = k13 * y[14]
    r14 = k14 * y[1] * y[6]
    r15 = k15 * y[3]
    r16 = k16 * y[4]
    r17 = k17 * y[4]
    r18 = k18 * y[16]
    r19 = k19 * y[16]
    r20 = k20 * y[17] * y[6]
    r21 = k21 * y[19]
    r22 = k22 * y[19]
    r23 = k23 * y[1] * y[4]
    r24 = k24 * y[19] * y[1]
    r25 = k25 * y[20]

    # Net production of each species: consumed rates negative, produced
    # rates positive (r4/r18 appear twice where stoichiometry is 2).
    dy[1] = -r1 - r10 - r14 - r23 - r24 +
            r2 + r3 + r9 + r11 + r12 + r22 + r25
    dy[2] = -r2 - r3 - r9 - r12 + r1 + r21
    dy[3] = -r15 + r1 + r17 + r19 + r22
    dy[4] = -r2 - r16 - r17 - r23 + r15
    dy[5] = -r3 + r4 + r4 + r6 + r7 + r13 + r20
    dy[6] = -r6 - r8 - r14 - r20 + r3 + r18 + r18
    dy[7] = -r4 - r5 - r6 + r13
    dy[8] = r4 + r5 + r6 + r7
    dy[9] = -r7 - r8
    dy[10] = -r12 + r7 + r9
    dy[11] = -r9 - r10 + r8 + r11
    dy[12] = r9
    dy[13] = -r11 + r10
    dy[14] = -r13 + r12
    dy[15] = r14
    dy[16] = -r18 - r19 + r16
    dy[17] = -r20
    dy[18] = r20
    dy[19] = -r21 - r22 - r24 + r23 + r25
    dy[20] = -r25 + r24
end
y[9] 159 | J[11, 13] = k11 160 | 161 | J[12, 11] = k9 * y[2] 162 | J[12, 2] = k9 * y[11] 163 | 164 | J[13, 13] = -k11 165 | J[13, 11] = k10 * y[1] 166 | J[13, 1] = k10 * y[11] 167 | 168 | J[14, 14] = -k13 169 | J[14, 10] = k12 * y[2] 170 | J[14, 2] = k12 * y[10] 171 | 172 | J[15, 1] = k14 * y[6] 173 | J[15, 6] = k14 * y[1] 174 | 175 | J[16, 16] = -k18 - k19 176 | J[16, 4] = k16 177 | 178 | J[17, 17] = -k20 * y[6] 179 | J[17, 6] = -k20 * y[17] 180 | 181 | J[18, 17] = k20 * y[6] 182 | J[18, 6] = k20 * y[17] 183 | 184 | J[19, 19] = -k21 - k22 - k24 * y[1] 185 | J[19, 1] = -k24 * y[19] + k23 * y[4] 186 | J[19, 4] = k23 * y[1] 187 | J[19, 20] = k25 188 | 189 | J[20, 20] = -k25 190 | J[20, 1] = k24 * y[19] 191 | J[20, 19] = k24 * y[1] 192 | 193 | return 194 | end 195 | 196 | u0 = zeros(20) 197 | u0[2] = 0.2 198 | u0[4] = 0.04 199 | u0[7] = 0.1 200 | u0[8] = 0.3 201 | u0[9] = 0.01 202 | u0[17] = 0.007 203 | oprob = ODEProblem(f, u0, (T(0.0), T(60.0))) 204 | 205 | prob = make_gpu_compatible(oprob, Val(T)) 206 | 207 | @assert prob.f(prob.u0, prob.p, T(1.0f0)) isa StaticArray{<:Tuple, T} 208 | 209 | ensembleProb = EnsembleProblem(prob) 210 | 211 | sol = solve(ensembleProb, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 212 | 213 | ### Lower level API #### 214 | 215 | ## Building problems here only 216 | I = 1:numberOfParameters 217 | if ensembleProb.safetycopy 218 | probs = map(I) do i 219 | ensembleProb.prob_func(deepcopy(ensembleProb.prob), i, 1) 220 | end 221 | else 222 | probs = map(I) do i 223 | ensembleProb.prob_func(ensembleProb.prob, i, 1) 224 | end 225 | end 226 | 227 | ## Make them compatible with CUDA 228 | probs = cu(probs) 229 | 230 | @info "Solving the problem" 231 | sol = @time CUDA.@sync DiffEqGPU.vectorized_asolve(probs, ensembleProb.prob, GPUTsit5(); 232 | save_everystep = false, dt = 0.001f0) 233 | 234 | # sol = solve(monteprob, GPUTsit5(), EnsembleGPUKernel(), trajectories = 2, dt = 1.0f0) 235 | 
-------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_STEPPERS_H 2 | #define SINGLESYSTEM_PERTHREAD_EXPLICITRUNGEKUTTA_STEPPERS_H 3 | 4 | 5 | // RK4 ------------------------------------------------------------------------ 6 | template 7 | __forceinline__ __device__ void PerThread_Stepper_RK4(\ 8 | int tid, \ 9 | Precision r_ActualTime, \ 10 | Precision r_TimeStep, \ 11 | Precision* r_ActualState, \ 12 | Precision* r_NextState, \ 13 | Precision* r_Error, \ 14 | int& r_IsFinite, \ 15 | Precision* r_ControlParameters, \ 16 | Precision* gs_SharedParameters, \ 17 | int* gs_IntegerSharedParameters, \ 18 | Precision* r_Accessories, \ 19 | int* r_IntegerAccessories) 20 | { 21 | // MEMORY MANAGEMENT ------------------------------------------------------ 22 | Precision X[SD]; 23 | Precision k1[SD]; 24 | 25 | Precision T; 26 | Precision dTp2 = static_cast(0.5) * r_TimeStep; 27 | Precision dTp6 = static_cast(1.0/6.0) * r_TimeStep; 28 | 29 | 30 | // K1 --------------------------------------------------------------------- 31 | PerThread_OdeFunction(\ 32 | tid, \ 33 | NT, \ 34 | r_NextState, \ 35 | r_ActualState, \ 36 | r_ActualTime, \ 37 | r_ControlParameters, \ 38 | gs_SharedParameters, \ 39 | gs_IntegerSharedParameters, \ 40 | r_Accessories, \ 41 | r_IntegerAccessories); 42 | 43 | 44 | // K2 --------------------------------------------------------------------- 45 | //printf("time: %f\n",r_ActualTime); 46 | T = r_ActualTime + dTp2; 47 | 48 | #pragma unroll 49 | for (int i=0; i(2.0)*k1[i]; 70 | X[i] = r_ActualState[i] + k1[i] * dTp2; 71 | } 72 | 73 | PerThread_OdeFunction(\ 74 | tid, \ 75 | NT, \ 76 | k1, \ 77 | X, \ 78 | T, \ 79 | r_ControlParameters, \ 80 | gs_SharedParameters, \ 81 | gs_IntegerSharedParameters, \ 82 | r_Accessories, \ 
83 | r_IntegerAccessories); 84 | 85 | 86 | // K4 --------------------------------------------------------------------- 87 | T = r_ActualTime + r_TimeStep; 88 | 89 | #pragma unroll 90 | for (int i=0; i(2.0)*k1[i]; 93 | X[i] = r_ActualState[i] + k1[i] * r_TimeStep; 94 | } 95 | 96 | PerThread_OdeFunction(\ 97 | tid, \ 98 | NT, \ 99 | k1, \ 100 | X, \ 101 | T, \ 102 | r_ControlParameters, \ 103 | gs_SharedParameters, \ 104 | gs_IntegerSharedParameters, \ 105 | r_Accessories, \ 106 | r_IntegerAccessories); 107 | 108 | 109 | // NEW STATE -------------------------------------------------------------- 110 | #pragma unroll 111 | for (int i=0; i 123 | __forceinline__ __device__ void PerThread_Stepper_RKCK45(\ 124 | int tid, \ 125 | Precision r_ActualTime, \ 126 | Precision r_TimeStep, \ 127 | Precision* r_ActualState, \ 128 | Precision* r_NextState, \ 129 | Precision* r_Error, \ 130 | int& r_IsFinite, \ 131 | Precision* r_ControlParameters, \ 132 | Precision* gs_SharedParameters, \ 133 | int* gs_IntegerSharedParameters, \ 134 | Precision* r_Accessories, \ 135 | int* r_IntegerAccessories) 136 | { 137 | // MEMORY MANAGEMENT ------------------------------------------------------ 138 | Precision X[SD]; 139 | Precision T; 140 | 141 | Precision k1[SD]; 142 | Precision k2[SD]; 143 | Precision k3[SD]; 144 | Precision k4[SD]; 145 | Precision k5[SD]; 146 | Precision k6[SD]; 147 | 148 | 149 | // K1 --------------------------------------------------------------------- 150 | PerThread_OdeFunction(\ 151 | tid, \ 152 | NT, \ 153 | k1, \ 154 | r_ActualState, \ 155 | r_ActualTime, \ 156 | r_ControlParameters, \ 157 | gs_SharedParameters, \ 158 | gs_IntegerSharedParameters, \ 159 | r_Accessories, \ 160 | r_IntegerAccessories); 161 | 162 | 163 | // K2 --------------------------------------------------------------------- 164 | //printf("time: %f\n",r_ActualTime); 165 | T = r_ActualTime + r_TimeStep * static_cast(1.0/5.0); 166 | 167 | #pragma unroll 168 | for (int i=0; i(1.0/5.0) * k1[i] ); 170 
| 171 | PerThread_OdeFunction(\ 172 | tid, \ 173 | NT, \ 174 | k2, \ 175 | X, \ 176 | T, \ 177 | r_ControlParameters, \ 178 | gs_SharedParameters, \ 179 | gs_IntegerSharedParameters, \ 180 | r_Accessories, \ 181 | r_IntegerAccessories); 182 | 183 | 184 | // K3 --------------------------------------------------------------------- 185 | T = r_ActualTime + r_TimeStep * static_cast(3.0/10.0); 186 | 187 | #pragma unroll 188 | for (int i=0; i(3.0/40.0) * k1[i] + \ 190 | static_cast(9.0/40.0) * k2[i] ); 191 | 192 | PerThread_OdeFunction(tid, \ 193 | NT, \ 194 | k3, \ 195 | X, \ 196 | T, \ 197 | r_ControlParameters, \ 198 | gs_SharedParameters, \ 199 | gs_IntegerSharedParameters, \ 200 | r_Accessories, \ 201 | r_IntegerAccessories); 202 | 203 | 204 | // K4 --------------------------------------------------------------------- 205 | T = r_ActualTime + r_TimeStep * static_cast(3.0/5.0); 206 | 207 | #pragma unroll 208 | for (int i=0; i(3.0/10.0) * k1[i] + \ 210 | static_cast(-9.0/10.0) * k2[i] + \ 211 | static_cast(6.0/5.0) * k3[i] ); 212 | 213 | PerThread_OdeFunction(\ 214 | tid, \ 215 | NT, \ 216 | k4, \ 217 | X, \ 218 | T, \ 219 | r_ControlParameters, \ 220 | gs_SharedParameters, \ 221 | gs_IntegerSharedParameters, \ 222 | r_Accessories, \ 223 | r_IntegerAccessories); 224 | 225 | 226 | // K5 --------------------------------------------------------------------- 227 | T = r_ActualTime + r_TimeStep; 228 | 229 | #pragma unroll 230 | for (int i=0; i(-11.0/54.0) * k1[i] + \ 232 | static_cast(5.0/2.0) * k2[i] + \ 233 | static_cast(-70.0/27.0) * k3[i] + \ 234 | static_cast(35.0/27.0) * k4[i] ); 235 | 236 | PerThread_OdeFunction(\ 237 | tid, \ 238 | NT, \ 239 | k5, \ 240 | X, \ 241 | T, \ 242 | r_ControlParameters, \ 243 | gs_SharedParameters, \ 244 | gs_IntegerSharedParameters, \ 245 | r_Accessories, \ 246 | r_IntegerAccessories); 247 | 248 | 249 | // K6 --------------------------------------------------------------------- 250 | T = r_ActualTime + r_TimeStep * static_cast(7.0/8.0); 
251 | 252 | #pragma unroll 253 | for (int i=0; i(1631.0/55296.0) * k1[i] + \ 255 | static_cast(175.0/512.0) * k2[i] + \ 256 | static_cast(575.0/13824.0) * k3[i] + \ 257 | static_cast(44275.0/110592.0) * k4[i] + \ 258 | static_cast(253.0/4096.0) * k5[i] ); 259 | 260 | PerThread_OdeFunction(\ 261 | tid, \ 262 | NT, \ 263 | k6, \ 264 | X, \ 265 | T, \ 266 | r_ControlParameters, \ 267 | gs_SharedParameters, \ 268 | gs_IntegerSharedParameters, \ 269 | r_Accessories, \ 270 | r_IntegerAccessories); 271 | 272 | 273 | // NEW STATE AND ERROR ---------------------------------------------------- 274 | #pragma unroll 275 | for (int i=0; i(37.0/378.0) * k1[i] + \ 278 | static_cast(250.0/621.0) * k3[i] + \ 279 | static_cast(125.0/594.0) * k4[i] + \ 280 | static_cast(512.0/1771.0) * k6[i] ); 281 | 282 | r_Error[i] = static_cast( 37.0/378.0 - 2825.0/27648.0 ) * k1[i] + \ 283 | static_cast( 250.0/621.0 - 18575.0/48384.0 ) * k3[i] + \ 284 | static_cast( 125.0/594.0 - 13525.0/55296.0 ) * k4[i] + \ 285 | static_cast( 0.0 - 277.0/14336.0 ) * k5[i] + \ 286 | static_cast( 512.0/1771.0 - 1.0/4.0 ) * k6[i]; 287 | r_Error[i] = r_TimeStep * abs( r_Error[i] ) + 1e-18; 288 | 289 | if ( ( isfinite( r_NextState[i] ) == 0 ) || ( isfinite( r_Error[i] ) == 0 ) ) 290 | r_IsFinite = 0; 291 | } 292 | } 293 | 294 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/CoupledSystems_PerBlock_EventHandling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef COUPLEDSYSTEMS_PERBLOCK_EVENTHANDLING_H 2 | #define COUPLEDSYSTEMS_PERBLOCK_EVENTHANDLING_H 3 | 4 | 5 | template 6 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_EventTimeStepControl(\ 7 | Precision* s_TimeStep, \ 8 | Precision* s_NewTimeStep, \ 9 | int* s_TerminateSystemScope, \ 10 | int* s_UpdateStep, \ 11 | Precision r_ActualEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 12 | Precision 
r_NextEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 13 | Precision* s_EventTolerance, \ 14 | int* s_EventDirection, \ 15 | Precision MinimumTimeStep) 16 | { 17 | int LocalThreadID_GPU = threadIdx.x; 18 | int BlockID = blockIdx.x; 19 | int LocalThreadID_Logical; 20 | int LocalSystemID; 21 | int GlobalSystemID; 22 | 23 | __shared__ Precision s_EventTimeStep[SPB]; 24 | __shared__ int s_IsCorrected[SPB]; 25 | 26 | // Event time step initialisation 27 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 28 | for (int j=0; j s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 51 | ( ( r_ActualEventValue[BL][i] < -s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 52 | { 53 | MPGOS::atomicFMIN(&(s_EventTimeStep[LocalSystemID]), -r_ActualEventValue[BL][i] / (r_NextEventValue[BL][i]-r_ActualEventValue[BL][i]) * s_TimeStep[LocalSystemID]); 54 | atomicMax(&(s_IsCorrected[LocalSystemID]), 1); 55 | } 56 | } 57 | } 58 | } 59 | __syncthreads(); 60 | 61 | // Corrected time step and modified update 62 | for (int j=0; j 84 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_EventTimeStepControl(\ 85 | Precision s_TimeStep, \ 86 | Precision& s_NewTimeStep, \ 87 | int s_TerminateSystemScope, \ 88 | int& s_UpdateStep, \ 89 | Precision r_ActualEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 90 | Precision r_NextEventValue[(NE==0?1:NBL)][(NE==0?1:NE)], \ 91 | Precision* s_EventTolerance, \ 92 | int* s_EventDirection, \ 93 | Precision MinimumTimeStep) 94 | { 95 | const int LocalThreadID_GPU = threadIdx.x; 96 | const int GlobalSystemID = blockIdx.x; 97 | 98 | int UnitID; 99 | 100 | 101 | __shared__ Precision s_EventTimeStep; 102 | __shared__ int s_IsCorrected; 103 | 104 | // Event time step initialisation 105 | if ( threadIdx.x == 0 ) 106 | { 107 | s_EventTimeStep = s_TimeStep; 108 | s_IsCorrected = 0; 109 | } 110 | 
__syncthreads(); 111 | 112 | // Event time step correction 113 | for (int BL=0; BL s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 122 | ( ( r_ActualEventValue[BL][i] < -s_EventTolerance[i] ) && ( r_NextEventValue[BL][i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 123 | { 124 | MPGOS::atomicFMIN(&s_EventTimeStep, -r_ActualEventValue[BL][i] / (r_NextEventValue[BL][i]-r_ActualEventValue[BL][i]) * s_TimeStep); 125 | atomicMax(&s_IsCorrected, 1); 126 | } 127 | } 128 | } 129 | } 130 | __syncthreads(); 131 | 132 | // Corrected time step and modified update 133 | if ( ( threadIdx.x == 0 ) && ( s_IsCorrected == 1 ) ) 134 | { 135 | if ( s_EventTimeStep < MinimumTimeStep ) 136 | { 137 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (global system id: %d)\n", GlobalSystemID); 138 | } else 139 | { 140 | s_NewTimeStep = s_EventTimeStep; 141 | s_UpdateStep = 0; 142 | } 143 | } 144 | __syncthreads(); 145 | } 146 | 147 | 148 | template 149 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_EventTimeStepControl(\ 150 | const int LocalSystemID, \ 151 | Precision* s_TimeStep, \ 152 | Precision* s_NewTimeStep, \ 153 | int* s_TerminateSystemScope, \ 154 | int* s_UpdateStep, \ 155 | Precision r_ActualEventValue[(NE==0?1:NE)], \ 156 | Precision r_NextEventValue[(NE==0?1:NE)], \ 157 | Precision* s_EventTolerance, \ 158 | int* s_EventDirection, \ 159 | Precision MinimumTimeStep) 160 | { 161 | __shared__ Precision s_EventTimeStep[SPB]; 162 | __shared__ int s_IsCorrected[SPB]; 163 | 164 | // Event time step initialisation 165 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 
0 : 1); 166 | for (int j=0; j s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 184 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 185 | { 186 | MPGOS::atomicFMIN(&(s_EventTimeStep[LocalSystemID]), -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * s_TimeStep[LocalSystemID]); 187 | atomicMax(&(s_IsCorrected[LocalSystemID]), 1); 188 | } 189 | } 190 | } 191 | __syncthreads(); 192 | 193 | // Corrected time step and modified update 194 | for (int j=0; j 216 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_EventTimeStepControl(\ 217 | const int GlobalSystemID, \ 218 | const int LocalThreadID, \ 219 | Precision s_TimeStep, \ 220 | Precision& s_NewTimeStep, \ 221 | int s_TerminateSystemScope, \ 222 | int& s_UpdateStep, \ 223 | Precision r_ActualEventValue[(NE==0?1:NE)], \ 224 | Precision r_NextEventValue[(NE==0?1:NE)], \ 225 | Precision* s_EventTolerance, \ 226 | int* s_EventDirection, \ 227 | Precision MinimumTimeStep) 228 | { 229 | __shared__ Precision s_EventTimeStep; 230 | __shared__ int s_IsCorrected; 231 | 232 | // Event time step initialisation 233 | if ( threadIdx.x == 0 ) 234 | { 235 | s_EventTimeStep = s_TimeStep; 236 | s_IsCorrected = 0; 237 | } 238 | __syncthreads(); 239 | 240 | // Event time step correction 241 | if ( ( LocalThreadID < UPS ) && ( s_UpdateStep == 1 ) && ( s_TerminateSystemScope == 0 ) ) 242 | { 243 | for (int i=0; i s_EventTolerance[i] ) && ( r_NextEventValue[i] < -s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 246 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( r_NextEventValue[i] > s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 247 | { 248 | MPGOS::atomicFMIN(&s_EventTimeStep, -r_ActualEventValue[i] / (r_NextEventValue[i]-r_ActualEventValue[i]) * s_TimeStep); 249 | atomicMax(&s_IsCorrected, 1); 250 
| } 251 | } 252 | } 253 | __syncthreads(); 254 | 255 | // Corrected time step and modified update 256 | if ( ( threadIdx.x == 0 ) && ( s_IsCorrected == 1 ) ) 257 | { 258 | if ( s_EventTimeStep < MinimumTimeStep ) 259 | { 260 | printf("Warning: Event cannot be detected without reducing the step size below the minimum! Event detection omitted!, (global system id: %d)\n", GlobalSystemID); 261 | } else 262 | { 263 | s_NewTimeStep = s_EventTimeStep; 264 | s_UpdateStep = 0; 265 | } 266 | } 267 | __syncthreads(); 268 | } 269 | 270 | #endif -------------------------------------------------------------------------------- /GPU_ODE_Julia/bin/catalyst_models/multisite2.net: -------------------------------------------------------------------------------- 1 | # Created by BioNetGen 2.7.0 2 | begin parameters 3 | 1 Rtot 5360 # Constant 4 | 2 Ltot 1160 # Constant 5 | 3 Atot 5360 # Constant 6 | 4 kOnR 0.01 # Constant 7 | 5 kOffR 0.1 # Constant 8 | 6 kCatR 5.0 # Constant 9 | 7 kOnL 0.01 # Constant 10 | 8 kOffL 0.1 # Constant 11 | 9 kCatL 5.1 # Constant 12 | end parameters 13 | begin species 14 | 1 R(a) Rtot 15 | 2 L(a) Ltot 16 | 3 A(p1~U,p2~U,p3~U) Atot 17 | 4 A(p1~U!1,p2~U,p3~U).R(a!1) 0 18 | 5 A(p1~U,p2~U!1,p3~U).R(a!1) 0 19 | 6 A(p1~U,p2~U,p3~U!1).R(a!1) 0 20 | 7 A(p1~U!1,p2~U!2,p3~U).R(a!1).R(a!2) 0 21 | 8 A(p1~U!1,p2~U,p3~U!2).R(a!1).R(a!2) 0 22 | 9 A(p1~P,p2~U,p3~U) 0 23 | 10 A(p1~U,p2~U!1,p3~U!2).R(a!1).R(a!2) 0 24 | 11 A(p1~U,p2~P,p3~U) 0 25 | 12 A(p1~U,p2~U,p3~P) 0 26 | 13 A(p1~U!1,p2~U!2,p3~U!3).R(a!1).R(a!2).R(a!3) 0 27 | 14 A(p1~U!1,p2~P,p3~U).R(a!1) 0 28 | 15 A(p1~U!1,p2~U,p3~P).R(a!1) 0 29 | 16 A(p1~P,p2~U!1,p3~U).R(a!1) 0 30 | 17 A(p1~P,p2~U,p3~U!1).R(a!1) 0 31 | 18 A(p1~P!1,p2~U,p3~U).L(a!1) 0 32 | 19 A(p1~U,p2~U!1,p3~P).R(a!1) 0 33 | 20 A(p1~U,p2~P,p3~U!1).R(a!1) 0 34 | 21 A(p1~U,p2~P!1,p3~U).L(a!1) 0 35 | 22 A(p1~U,p2~U,p3~P!1).L(a!1) 0 36 | 23 A(p1~U!1,p2~U!2,p3~P).R(a!1).R(a!2) 0 37 | 24 A(p1~U!1,p2~P,p3~U!2).R(a!1).R(a!2) 0 38 | 25 
A(p1~U!1,p2~P!2,p3~U).L(a!2).R(a!1) 0 39 | 26 A(p1~U!1,p2~U,p3~P!2).L(a!2).R(a!1) 0 40 | 27 A(p1~P,p2~U!1,p3~U!2).R(a!1).R(a!2) 0 41 | 28 A(p1~P,p2~P,p3~U) 0 42 | 29 A(p1~P,p2~U,p3~P) 0 43 | 30 A(p1~P!1,p2~U!2,p3~U).L(a!1).R(a!2) 0 44 | 31 A(p1~P!1,p2~U,p3~U!2).L(a!1).R(a!2) 0 45 | 32 A(p1~U,p2~U!1,p3~P!2).L(a!2).R(a!1) 0 46 | 33 A(p1~U,p2~P,p3~P) 0 47 | 34 A(p1~U,p2~P!1,p3~U!2).L(a!1).R(a!2) 0 48 | 35 A(p1~U!1,p2~U!2,p3~P!3).L(a!3).R(a!1).R(a!2) 0 49 | 36 A(p1~U!1,p2~P,p3~P).R(a!1) 0 50 | 37 A(p1~U!1,p2~P!2,p3~U!3).L(a!2).R(a!1).R(a!3) 0 51 | 38 A(p1~P,p2~U!1,p3~P).R(a!1) 0 52 | 39 A(p1~P,p2~P,p3~U!1).R(a!1) 0 53 | 40 A(p1~P,p2~P!1,p3~U).L(a!1) 0 54 | 41 A(p1~P,p2~U,p3~P!1).L(a!1) 0 55 | 42 A(p1~P!1,p2~U!2,p3~U!3).L(a!1).R(a!2).R(a!3) 0 56 | 43 A(p1~P!1,p2~P,p3~U).L(a!1) 0 57 | 44 A(p1~P!1,p2~U,p3~P).L(a!1) 0 58 | 45 A(p1~U,p2~P,p3~P!1).L(a!1) 0 59 | 46 A(p1~U,p2~P!1,p3~P).L(a!1) 0 60 | 47 A(p1~U!1,p2~P,p3~P!2).L(a!2).R(a!1) 0 61 | 48 A(p1~U!1,p2~P!2,p3~P).L(a!2).R(a!1) 0 62 | 49 A(p1~P,p2~U!1,p3~P!2).L(a!2).R(a!1) 0 63 | 50 A(p1~P,p2~P,p3~P) 0 64 | 51 A(p1~P,p2~P!1,p3~U!2).L(a!1).R(a!2) 0 65 | 52 A(p1~P!1,p2~U!2,p3~P).L(a!1).R(a!2) 0 66 | 53 A(p1~P!1,p2~P,p3~U!2).L(a!1).R(a!2) 0 67 | 54 A(p1~P!1,p2~P!2,p3~U).L(a!1).L(a!2) 0 68 | 55 A(p1~P!1,p2~U,p3~P!2).L(a!1).L(a!2) 0 69 | 56 A(p1~U,p2~P!1,p3~P!2).L(a!1).L(a!2) 0 70 | 57 A(p1~U!1,p2~P!2,p3~P!3).L(a!2).L(a!3).R(a!1) 0 71 | 58 A(p1~P,p2~P,p3~P!1).L(a!1) 0 72 | 59 A(p1~P,p2~P!1,p3~P).L(a!1) 0 73 | 60 A(p1~P!1,p2~U!2,p3~P!3).L(a!1).L(a!3).R(a!2) 0 74 | 61 A(p1~P!1,p2~P,p3~P).L(a!1) 0 75 | 62 A(p1~P!1,p2~P!2,p3~U!3).L(a!1).L(a!2).R(a!3) 0 76 | 63 A(p1~P,p2~P!1,p3~P!2).L(a!1).L(a!2) 0 77 | 64 A(p1~P!1,p2~P,p3~P!2).L(a!1).L(a!2) 0 78 | 65 A(p1~P!1,p2~P!2,p3~P).L(a!1).L(a!2) 0 79 | 66 A(p1~P!1,p2~P!2,p3~P!3).L(a!1).L(a!2).L(a!3) 0 80 | end species 81 | begin reactions 82 | 1 1,3 4 kOnR #_R1 83 | 2 1,3 5 kOnR #_R5 84 | 3 1,3 6 kOnR #_R9 85 | 4 1,5 7 kOnR #_R1 86 | 5 1,6 8 kOnR #_R1 87 | 6 4 1,3 kOffR #_reverse__R1 88 | 7 
4 1,9 kCatR #_R2 89 | 8 1,4 7 kOnR #_R5 90 | 9 1,6 10 kOnR #_R5 91 | 10 5 1,3 kOffR #_reverse__R5 92 | 11 5 1,11 kCatR #_R6 93 | 12 1,4 8 kOnR #_R9 94 | 13 1,5 10 kOnR #_R9 95 | 14 6 1,3 kOffR #_reverse__R9 96 | 15 6 1,12 kCatR #_R10 97 | 16 1,10 13 kOnR #_R1 98 | 17 1,11 14 kOnR #_R1 99 | 18 1,12 15 kOnR #_R1 100 | 19 7 1,5 kOffR #_reverse__R1 101 | 20 8 1,6 kOffR #_reverse__R1 102 | 21 7 1,16 kCatR #_R2 103 | 22 8 1,17 kCatR #_R2 104 | 23 2,9 18 kOnL #_R3 105 | 24 1,8 13 kOnR #_R5 106 | 25 1,9 16 kOnR #_R5 107 | 26 1,12 19 kOnR #_R5 108 | 27 7 1,4 kOffR #_reverse__R5 109 | 28 10 1,6 kOffR #_reverse__R5 110 | 29 7 1,14 kCatR #_R6 111 | 30 10 1,20 kCatR #_R6 112 | 31 2,11 21 kOnL #_R7 113 | 32 1,7 13 kOnR #_R9 114 | 33 1,9 17 kOnR #_R9 115 | 34 1,11 20 kOnR #_R9 116 | 35 8 1,4 kOffR #_reverse__R9 117 | 36 10 1,5 kOffR #_reverse__R9 118 | 37 8 1,15 kCatR #_R10 119 | 38 10 1,19 kCatR #_R10 120 | 39 2,12 22 kOnL #_R11 121 | 40 1,19 23 kOnR #_R1 122 | 41 1,20 24 kOnR #_R1 123 | 42 1,21 25 kOnR #_R1 124 | 43 1,22 26 kOnR #_R1 125 | 44 13 1,10 kOffR #_reverse__R1 126 | 45 14 1,11 kOffR #_reverse__R1 127 | 46 15 1,12 kOffR #_reverse__R1 128 | 47 13 1,27 kCatR #_R2 129 | 48 14 1,28 kCatR #_R2 130 | 49 15 1,29 kCatR #_R2 131 | 50 2,16 30 kOnL #_R3 132 | 51 2,17 31 kOnL #_R3 133 | 52 18 2,9 kOffL #_reverse__R3 134 | 53 18 2,3 kCatL #_R4 135 | 54 1,15 23 kOnR #_R5 136 | 55 1,17 27 kOnR #_R5 137 | 56 1,18 30 kOnR #_R5 138 | 57 1,22 32 kOnR #_R5 139 | 58 13 1,8 kOffR #_reverse__R5 140 | 59 16 1,9 kOffR #_reverse__R5 141 | 60 19 1,12 kOffR #_reverse__R5 142 | 61 13 1,24 kCatR #_R6 143 | 62 16 1,28 kCatR #_R6 144 | 63 19 1,33 kCatR #_R6 145 | 64 2,14 25 kOnL #_R7 146 | 65 2,20 34 kOnL #_R7 147 | 66 21 2,11 kOffL #_reverse__R7 148 | 67 21 2,3 kCatL #_R8 149 | 68 1,14 24 kOnR #_R9 150 | 69 1,16 27 kOnR #_R9 151 | 70 1,18 31 kOnR #_R9 152 | 71 1,21 34 kOnR #_R9 153 | 72 13 1,7 kOffR #_reverse__R9 154 | 73 17 1,9 kOffR #_reverse__R9 155 | 74 20 1,11 kOffR #_reverse__R9 156 | 75 13 
1,23 kCatR #_R10 157 | 76 17 1,29 kCatR #_R10 158 | 77 20 1,33 kCatR #_R10 159 | 78 2,15 26 kOnL #_R11 160 | 79 2,19 32 kOnL #_R11 161 | 80 22 2,12 kOffL #_reverse__R11 162 | 81 22 2,3 kCatL #_R12 163 | 82 1,32 35 kOnR #_R1 164 | 83 1,33 36 kOnR #_R1 165 | 84 1,34 37 kOnR #_R1 166 | 85 23 1,19 kOffR #_reverse__R1 167 | 86 24 1,20 kOffR #_reverse__R1 168 | 87 25 1,21 kOffR #_reverse__R1 169 | 88 26 1,22 kOffR #_reverse__R1 170 | 89 23 1,38 kCatR #_R2 171 | 90 24 1,39 kCatR #_R2 172 | 91 25 1,40 kCatR #_R2 173 | 92 26 1,41 kCatR #_R2 174 | 93 2,27 42 kOnL #_R3 175 | 94 2,28 43 kOnL #_R3 176 | 95 2,29 44 kOnL #_R3 177 | 96 30 2,16 kOffL #_reverse__R3 178 | 97 31 2,17 kOffL #_reverse__R3 179 | 98 30 2,5 kCatL #_R4 180 | 99 31 2,6 kCatL #_R4 181 | 100 1,26 35 kOnR #_R5 182 | 101 1,29 38 kOnR #_R5 183 | 102 1,31 42 kOnR #_R5 184 | 103 23 1,15 kOffR #_reverse__R5 185 | 104 27 1,17 kOffR #_reverse__R5 186 | 105 30 1,18 kOffR #_reverse__R5 187 | 106 32 1,22 kOffR #_reverse__R5 188 | 107 23 1,36 kCatR #_R6 189 | 108 27 1,39 kCatR #_R6 190 | 109 30 1,43 kCatR #_R6 191 | 110 32 1,45 kCatR #_R6 192 | 111 2,24 37 kOnL #_R7 193 | 112 2,28 40 kOnL #_R7 194 | 113 2,33 46 kOnL #_R7 195 | 114 25 2,14 kOffL #_reverse__R7 196 | 115 34 2,20 kOffL #_reverse__R7 197 | 116 25 2,4 kCatL #_R8 198 | 117 34 2,6 kCatL #_R8 199 | 118 1,25 37 kOnR #_R9 200 | 119 1,28 39 kOnR #_R9 201 | 120 1,30 42 kOnR #_R9 202 | 121 24 1,14 kOffR #_reverse__R9 203 | 122 27 1,16 kOffR #_reverse__R9 204 | 123 31 1,18 kOffR #_reverse__R9 205 | 124 34 1,21 kOffR #_reverse__R9 206 | 125 24 1,36 kCatR #_R10 207 | 126 27 1,38 kCatR #_R10 208 | 127 31 1,44 kCatR #_R10 209 | 128 34 1,46 kCatR #_R10 210 | 129 2,23 35 kOnL #_R11 211 | 130 2,29 41 kOnL #_R11 212 | 131 2,33 45 kOnL #_R11 213 | 132 26 2,15 kOffL #_reverse__R11 214 | 133 32 2,19 kOffL #_reverse__R11 215 | 134 26 2,4 kCatL #_R12 216 | 135 32 2,5 kCatL #_R12 217 | 136 1,45 47 kOnR #_R1 218 | 137 1,46 48 kOnR #_R1 219 | 138 35 1,32 kOffR #_reverse__R1 220 | 139 
36 1,33 kOffR #_reverse__R1 221 | 140 37 1,34 kOffR #_reverse__R1 222 | 141 35 1,49 kCatR #_R2 223 | 142 36 1,50 kCatR #_R2 224 | 143 37 1,51 kCatR #_R2 225 | 144 2,38 52 kOnL #_R3 226 | 145 2,39 53 kOnL #_R3 227 | 146 2,40 54 kOnL #_R3 228 | 147 2,41 55 kOnL #_R3 229 | 148 42 2,27 kOffL #_reverse__R3 230 | 149 43 2,28 kOffL #_reverse__R3 231 | 150 44 2,29 kOffL #_reverse__R3 232 | 151 42 2,10 kCatL #_R4 233 | 152 43 2,11 kCatL #_R4 234 | 153 44 2,12 kCatL #_R4 235 | 154 1,41 49 kOnR #_R5 236 | 155 1,44 52 kOnR #_R5 237 | 156 35 1,26 kOffR #_reverse__R5 238 | 157 38 1,29 kOffR #_reverse__R5 239 | 158 42 1,31 kOffR #_reverse__R5 240 | 159 35 1,47 kCatR #_R6 241 | 160 38 1,50 kCatR #_R6 242 | 161 42 1,53 kCatR #_R6 243 | 162 2,36 48 kOnL #_R7 244 | 163 2,39 51 kOnL #_R7 245 | 164 2,43 54 kOnL #_R7 246 | 165 2,45 56 kOnL #_R7 247 | 166 37 2,24 kOffL #_reverse__R7 248 | 167 40 2,28 kOffL #_reverse__R7 249 | 168 46 2,33 kOffL #_reverse__R7 250 | 169 37 2,8 kCatL #_R8 251 | 170 40 2,9 kCatL #_R8 252 | 171 46 2,12 kCatL #_R8 253 | 172 1,40 51 kOnR #_R9 254 | 173 1,43 53 kOnR #_R9 255 | 174 37 1,25 kOffR #_reverse__R9 256 | 175 39 1,28 kOffR #_reverse__R9 257 | 176 42 1,30 kOffR #_reverse__R9 258 | 177 37 1,48 kCatR #_R10 259 | 178 39 1,50 kCatR #_R10 260 | 179 42 1,52 kCatR #_R10 261 | 180 2,36 47 kOnL #_R11 262 | 181 2,38 49 kOnL #_R11 263 | 182 2,44 55 kOnL #_R11 264 | 183 2,46 56 kOnL #_R11 265 | 184 35 2,23 kOffL #_reverse__R11 266 | 185 41 2,29 kOffL #_reverse__R11 267 | 186 45 2,33 kOffL #_reverse__R11 268 | 187 35 2,7 kCatL #_R12 269 | 188 41 2,9 kCatL #_R12 270 | 189 45 2,11 kCatL #_R12 271 | 190 1,56 57 kOnR #_R1 272 | 191 47 1,45 kOffR #_reverse__R1 273 | 192 48 1,46 kOffR #_reverse__R1 274 | 193 47 1,58 kCatR #_R2 275 | 194 48 1,59 kCatR #_R2 276 | 195 2,49 60 kOnL #_R3 277 | 196 2,50 61 kOnL #_R3 278 | 197 2,51 62 kOnL #_R3 279 | 198 52 2,38 kOffL #_reverse__R3 280 | 199 53 2,39 kOffL #_reverse__R3 281 | 200 54 2,40 kOffL #_reverse__R3 282 | 201 55 2,41 kOffL 
#_reverse__R3 283 | 202 52 2,19 kCatL #_R4 284 | 203 53 2,20 kCatL #_R4 285 | 204 54 2,21 kCatL #_R4 286 | 205 55 2,22 kCatL #_R4 287 | 206 1,55 60 kOnR #_R5 288 | 207 49 1,41 kOffR #_reverse__R5 289 | 208 52 1,44 kOffR #_reverse__R5 290 | 209 49 1,58 kCatR #_R6 291 | 210 52 1,61 kCatR #_R6 292 | 211 2,47 57 kOnL #_R7 293 | 212 2,50 59 kOnL #_R7 294 | 213 2,53 62 kOnL #_R7 295 | 214 48 2,36 kOffL #_reverse__R7 296 | 215 51 2,39 kOffL #_reverse__R7 297 | 216 54 2,43 kOffL #_reverse__R7 298 | 217 56 2,45 kOffL #_reverse__R7 299 | 218 48 2,15 kCatL #_R8 300 | 219 51 2,17 kCatL #_R8 301 | 220 54 2,18 kCatL #_R8 302 | 221 56 2,22 kCatL #_R8 303 | 222 1,54 62 kOnR #_R9 304 | 223 51 1,40 kOffR #_reverse__R9 305 | 224 53 1,43 kOffR #_reverse__R9 306 | 225 51 1,59 kCatR #_R10 307 | 226 53 1,61 kCatR #_R10 308 | 227 2,48 57 kOnL #_R11 309 | 228 2,50 58 kOnL #_R11 310 | 229 2,52 60 kOnL #_R11 311 | 230 47 2,36 kOffL #_reverse__R11 312 | 231 49 2,38 kOffL #_reverse__R11 313 | 232 55 2,44 kOffL #_reverse__R11 314 | 233 56 2,46 kOffL #_reverse__R11 315 | 234 47 2,14 kCatL #_R12 316 | 235 49 2,16 kCatL #_R12 317 | 236 55 2,18 kCatL #_R12 318 | 237 56 2,21 kCatL #_R12 319 | 238 57 1,56 kOffR #_reverse__R1 320 | 239 57 1,63 kCatR #_R2 321 | 240 2,58 64 kOnL #_R3 322 | 241 2,59 65 kOnL #_R3 323 | 242 60 2,49 kOffL #_reverse__R3 324 | 243 61 2,50 kOffL #_reverse__R3 325 | 244 62 2,51 kOffL #_reverse__R3 326 | 245 60 2,32 kCatL #_R4 327 | 246 61 2,33 kCatL #_R4 328 | 247 62 2,34 kCatL #_R4 329 | 248 60 1,55 kOffR #_reverse__R5 330 | 249 60 1,64 kCatR #_R6 331 | 250 2,58 63 kOnL #_R7 332 | 251 2,61 65 kOnL #_R7 333 | 252 57 2,47 kOffL #_reverse__R7 334 | 253 59 2,50 kOffL #_reverse__R7 335 | 254 62 2,53 kOffL #_reverse__R7 336 | 255 57 2,26 kCatL #_R8 337 | 256 59 2,29 kCatL #_R8 338 | 257 62 2,31 kCatL #_R8 339 | 258 62 1,54 kOffR #_reverse__R9 340 | 259 62 1,65 kCatR #_R10 341 | 260 2,59 63 kOnL #_R11 342 | 261 2,61 64 kOnL #_R11 343 | 262 57 2,48 kOffL #_reverse__R11 344 | 263 58 
2,50 kOffL #_reverse__R11 345 | 264 60 2,52 kOffL #_reverse__R11 346 | 265 57 2,25 kCatL #_R12 347 | 266 58 2,28 kCatL #_R12 348 | 267 60 2,30 kCatL #_R12 349 | 268 2,63 66 kOnL #_R3 350 | 269 64 2,58 kOffL #_reverse__R3 351 | 270 65 2,59 kOffL #_reverse__R3 352 | 271 64 2,45 kCatL #_R4 353 | 272 65 2,46 kCatL #_R4 354 | 273 2,64 66 kOnL #_R7 355 | 274 63 2,58 kOffL #_reverse__R7 356 | 275 65 2,61 kOffL #_reverse__R7 357 | 276 63 2,41 kCatL #_R8 358 | 277 65 2,44 kCatL #_R8 359 | 278 2,65 66 kOnL #_R11 360 | 279 63 2,59 kOffL #_reverse__R11 361 | 280 64 2,61 kOffL #_reverse__R11 362 | 281 63 2,40 kCatL #_R12 363 | 282 64 2,43 kCatL #_R12 364 | 283 66 2,63 kOffL #_reverse__R3 365 | 284 66 2,56 kCatL #_R4 366 | 285 66 2,64 kOffL #_reverse__R7 367 | 286 66 2,55 kCatL #_R8 368 | 287 66 2,65 kOffL #_reverse__R11 369 | 288 66 2,54 kCatL #_R12 370 | end reactions 371 | begin groups 372 | 1 Rfree 1 373 | 2 Lfree 2 374 | 3 A1P 9,16,17,18,27,28,29,30,31,38,39,40,41,42,43,44,49,50,51,52,53,54,55,58,59,60,61,62,63,64,65,66 375 | end groups 376 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPUODEBenchmarks 2 | Comparison of Julia's GPU-based ensemble ODE solvers with other open-source implementations in C++, JAX, and PyTorch. These artifacts are part of the paper: 3 | > Automated Translation and Accelerated Solving of Differential Equations on Multiple GPU Platforms 4 | 5 | **_NOTE:_** This repository is meant to contain scripts for benchmarking existing ensemble ODE solvers. For external purposes, one can directly use the solvers from the respective libraries. 6 | 7 | ### Performance comparison with other open-source ensemble ODE solvers 8 | drawing 9 | 10 | ### Works with NVIDIA, Intel, AMD, and Apple GPUs 11 | drawing 12 | 13 | # Reproduction of the benchmarks 14 | 15 | The methods are written in Julia and are part of the repository 16 | . 
The benchmark suite also 17 | consists of the raw data, such as simulation times and plots mentioned 18 | in the paper. The supported OS for the benchmark suite is Linux. 19 | 20 | ## Installing Julia 21 | 22 | Firstly, we will need to install Julia. The user can download the 23 | binaries from the official JuliaLang website 24 | [`https://julialang.org/downloads/`](https://julialang.org/downloads/). 25 | Alternatively, one can use the convenience of a Julia version 26 | multiplexer, . The recommended OS 27 | for installation is Linux. The recommended Julia installation version is 28 | v1.8. To use AMD GPUs, please install v1.9. The Julia installation 29 | should also be added to the user's path. 30 | 31 | ## Setting up DiffEqGPU.jl 32 | 33 | ### Installing backends 34 | 35 | The user must install the GPU backend library for testing 36 | DiffEqGPU.jl-related code. 37 | 38 | ```julia 39 | julia> using Pkg 40 | julia> #Run either of them 41 | julia> Pkg.add("CUDA") # NVIDIA GPUs 42 | julia> Pkg.add("AMDGPU") #AMD GPUs 43 | julia> Pkg.add("oneAPI") #Intel GPUs 44 | julia> Pkg.add("Metal") #Apple M series GPUs 45 | ``` 46 | ### Testing DiffEqGPU.jl 47 | 48 | DiffEqGPU.jl is a test suite that regularly checks functionality by 49 | testing features like multiple backend support, event handling, and 50 | automatic differentiation. To test the functionality, one can follow the 51 | below instructions. The user needs to specify the \"backend\" for 52 | example \"CUDA\" for NVIDIA, \"AMDGPU\" for AMD, \"oneAPI\" for Intel 53 | , and \"Metal\" for Apple GPUs. The estimated time of completion is 20 54 | minutes. 55 | ```julia 56 | $ julia --project=. 57 | julia> using Pkg 58 | julia> Pkg.instantiate() 59 | julia> Pkg.precompile() 60 | ``` 61 | Finally, test the package with this command 62 | ```bash 63 | $ backend="CUDA" 64 | $ julia --project=. 
Additionally, the GitHub discussion
[`https://github.com/SciML/DiffEqGPU.jl/issues/224#issuecomment-1453769679`](https://github.com/SciML/DiffEqGPU.jl/issues/224#issuecomment-1453769679)
highlights the use of texture memory with ODE solvers, which accelerates the
code by $2\times$ over the CPU.
The data will be generated in the `data/Julia` directory, with two files
for fixed and adaptive time-stepping simulations. The first column in
the ".txt" file will be the number of trajectories, and the second
column will contain the time in milliseconds.

Additionally, to benchmark ODE solvers for other backends:
```bash
$ N=$((2**24))
# Benchmark
$ backend="Metal"
$ ./runner_scripts/gpu/run_ode_mult_device.sh \
    $N $backend
```
The 147 | NVIDIA's website lists the resource 148 | [`https://developer.nvidia.com/cuda-downloads`](https://developer.nvidia.com/cuda-downloads) 149 | for installation. 150 | 151 | The MPGOS scripts are in the `GPU_ODE_MPGOS` folder. The file 152 | `GPU_ODE_MPGOS/Lorenz.cu` is the main executed code. However, the MPGOS 153 | programs can be run with the same bash script by changing the arguments 154 | as: 155 | ```bash 156 | $ bash ./run_benchmark.sh -l cpp -d gpu -m ode 157 | ``` 158 | It will generate the data files in the `data/cpp` folder. 159 | 160 | ### Benchmarking JAX (Diffrax) ODE solvers 161 | 162 | Benchmarking JAX-based ODE solvers require installing Python 3.9 and 163 | `conda`. First, we will install all the Python packages for 164 | benchmarking: 165 | ```bash 166 | $ conda env create -f environment.yml 167 | $ conda activate venv_jax 168 | ``` 169 | It should install the correct version of JAX with CUDA enabled and the 170 | Diffrax library. The GitHub 171 | [`https://github.com/google/jax#installation`](https://github.com/google/jax#installation) 172 | is a guide to follow if the installation fails. 173 | 174 | For our purposes, we can benchmark the solvers by: 175 | ```bash 176 | $ bash ./run_benchmark.sh -l jax -d gpu -m ode 177 | ``` 178 | 179 | #### A note on JIT ordering in JAX 180 | 181 | The JIT ordering JAX matters and sometimes can enhance performance if done correctly. We have tested that vmap and JIT ordering does not make a noticeable difference in our case. The results are available at this [Colab notebook](https://colab.research.google.com/drive/1d7G-O5JX31lHbg7jTzzozbo5-Gp7DBEv?usp=sharing). 182 | 183 | ### Benchmarking PyTorch (torchdiffeq) ODE solvers 184 | 185 | Benchmarking PyTorch-based ODE solvers is a similar process compared to 186 | JAX ones. 187 | ```bash 188 | $ conda env create -f environment.yml 189 | $ conda activate venv_torch 190 | ``` 191 | `torchdiffeq` does not fully support vectorized maps with ODE solvers. 
192 | To circumvent this, we extended the functionality by rewriting some 193 | library parts. To download it: 194 | ```bash 195 | (venv_torch)$ pip uninstall torchdiffeq 196 | (venv_torch)$ pip uninstall torchdiffeq 197 | (venv_torch)$ pip install git+https://github.com/\ 198 | utkarsh530/torchdiffeq.git@u/vmap 199 | ``` 200 | Then run the benchmarks by: 201 | ```bash 202 | $ bash ./run_benchmark.sh -l pytorch -d gpu -m ode 203 | ``` 204 | ## Comparing GPU acceleration of ODEs with CPUs 205 | 206 | The benchmark suite can also be used to test the GPU acceleration of ODE 207 | solvers in comparison with CPUs. The process for generating simulation 208 | times for GPUs can be done by following the GPU section mentioned earlier. The following bash script 209 | allows the generation of CPU simulation times for ODEs: 210 | ```bash 211 | $ bash ./run_benchmark.sh -l julia -d cpu -m ode 212 | ``` 213 | The simulation times will be generated in `data/CPU`. Each of the 214 | workflow takes approximately 20 minutes to finish. 215 | 216 | ## Benchmarking GPU acceleration of SDEs with CPUs 217 | 218 | The SDE solvers in Julia are benchmarked by comparing them to the 219 | CPU-accelerated simulation. This will benchmark the linear SDE with 220 | three states, as described in the \"Benchmarks and case studies\" 221 | section. To generate simulation times for GPU, do the following: 222 | ```bash 223 | $ bash ./run_benchmark.sh -l julia -d gpu -m sde 224 | ``` 225 | We can generate the simulation times for CPU-accelerated codes through the following: 226 | ```bash 227 | $ bash ./run_benchmark.sh -l julia -d cpu -m sde 228 | ``` 229 | The results will get generated in `data/SDE` and `data/CPU/SDE`, taking 230 | around 10 minutes to complete. 231 | 232 | ## Composability with MPI 233 | 234 | Julia supports Message Passing Interface (MPI) to allow Single Program 235 | Multiple Data (SPMD) type parallel programming. 
The composability of the 236 | GPU ODE solvers enable seamless integration with MPI, enabling scaling 237 | the ODE solvers to clusters on multiple nodes. 238 | ```julia 239 | $ julia --project=./GPU_ODE_Julia 240 | julia> using Pkg 241 | # install MPI.jl 242 | julia> Pkg.add("MPI") 243 | ``` 244 | An example script solving the Lorenz problem for approximately 1 billion 245 | parameters is available in the `MPI` folder. A SLURM-based script is 246 | shown below. 247 | ```bash 248 | #!/bin/bash 249 | # Slurm Sbatch Options 250 | # Request no. of GPUs/node 251 | #SBATCH --gres=gpu:volta:1 252 | # 1 process per node 253 | #SBATCH -n 5 -N 5 254 | #SBATCH --output="./mpi_scatter_test.log-%j" 255 | # Loading the required module 256 | 257 | # MPI.jl requires a memory pool to be disabled 258 | export JULIA_CUDA_MEMORY_POOL=none 259 | export JULIA_MPI_BINARY=system 260 | # Use local CUDA toolkit installation 261 | export JULIA_CUDA_USE_BINARYBUILDER=false 262 | 263 | source $HOME/.bashrc 264 | module load cuda mpi 265 | 266 | srun hostname > hostfile 267 | time mpiexec julia --project=./GPU_ODE_Julia\ 268 | ./MPI/gpu_ode_mpi.jl 269 | ``` 270 | ## Plotting Results 271 | 272 | The plotting scripts visualize the simulation times. The scripts are 273 | located in the `runner_scripts/plot` folder. These scripts replicate the 274 | benchmark figures in the paper. The benchmark suite contains the 275 | simulation data generated by authors, which can be used to verify the 276 | plots. Various benchmarks can be plotted, which are described in the 277 | different sections. The plotting scripts are based on Julia. As a 278 | preliminary step: 279 | ```julia 280 | $ cd GPUODEBenchmarks 281 | $ julia --project=. 282 | julia> using Pkg 283 | julia> Pkg.instantiate() 284 | julia> Pkg.precompile() 285 | ``` 286 | The plot comparison between Julia, C++, JAX, and PyTorch mentioned in 287 | the paper can be generated by using the below command: 288 | ```bash 289 | $ julia --project=. 
./runner_scripts/plot\ 290 | /plot_ode_comp.jl 291 | ``` 292 | The plot will get saved in the `plots` folder. 293 | 294 | Similarly, the other plots in the paper can be generated by running the 295 | different scripts in the folder `runner_scripts/plot`. 296 | ```bash 297 | plot performance of GPU ODE solvers 298 | with multiple backends 299 | $ julia --project=. ./runner_scripts/plot\ 300 | /plot_mult_gpu.jl 301 | plot GPU ODE solvers comparison with CPUs 302 | $ julia --project=. ./runner_scripts/plot\ 303 | /plot_ode_comp.jl 304 | plot GPU SDE solvers comparison with CPUs 305 | $ julia --project=. ./runner_scripts/plot\ 306 | /plot_sde_comp.jl 307 | plot CRN Network sim comparison with CPUs 308 | $ julia --project=. ./runner_scripts/plot\ 309 | /plot_sde_crn.jl 310 | ``` 311 | To plot data generated by running the scripts, specify the location of 312 | the `data` as the argument to the mentioned command. 313 | ```bash 314 | $ julia --project=. ./runner_scripts/plot/\ 315 | plot_mult_gpu.jl /path/to/data/ 316 | ``` 317 | -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/SingleSystem_PerThread_Solver.cuh: -------------------------------------------------------------------------------- 1 | #ifndef SINGLESYSTEM_PERTHREAD_SOLVER_H 2 | #define SINGLESYSTEM_PERTHREAD_SOLVER_H 3 | 4 | #include "MPGOS_Overloaded_MathFunction.cuh" 5 | #include "SingleSystem_PerThread_DenseOutput.cuh" 6 | #include "SingleSystem_PerThread_ExplicitRungeKutta_Steppers.cuh" 7 | #include "SingleSystem_PerThread_ExplicitRungeKutta_ErrorControllers.cuh" 8 | #include "SingleSystem_PerThread_EventHandling.cuh" 9 | 10 | 11 | template 12 | __global__ void SingleSystem_PerThread(Struct_ThreadConfiguration ThreadConfiguration, Struct_GlobalVariables GlobalVariables, Struct_SharedMemoryUsage SharedMemoryUsage, Struct_SolverOptions SolverOptions) 13 | { 14 | // THREAD MANAGEMENT ------------------------------------------------------ 15 | int tid = 
threadIdx.x + blockIdx.x*blockDim.x; 16 | 17 | 18 | // SHARED MEMORY MANAGEMENT ----------------------------------------------- 19 | // DUE TO REQUIRED MEMORY ALIGMENT: PRECISONS FIRST, INTS NEXT IN DYNAMICALLY ALLOCATED SHARED MEMORY 20 | // MINIMUM ALLOCABLE MEMORY IS 1 21 | extern __shared__ int DynamicSharedMemory[]; 22 | int MemoryShift; 23 | 24 | Precision* gs_SharedParameters = (Precision*)&DynamicSharedMemory; 25 | MemoryShift = (SharedMemoryUsage.PreferSharedMemory == 1 ? NSP : 0); 26 | 27 | int* gs_IntegerSharedParameters = (int*)&gs_SharedParameters[MemoryShift]; 28 | 29 | const bool IsAdaptive = ( Algorithm==RK4 ? 0 : 1 ); 30 | 31 | __shared__ Precision s_RelativeTolerance[ (IsAdaptive==0 ? 1 : SD) ]; 32 | __shared__ Precision s_AbsoluteTolerance[ (IsAdaptive==0 ? 1 : SD) ]; 33 | __shared__ Precision s_EventTolerance[ (NE==0 ? 1 : NE) ]; 34 | __shared__ int s_EventDirection[ (NE==0 ? 1 : NE) ]; 35 | 36 | // Initialise tolerances of adaptive solvers 37 | if ( IsAdaptive == 1 ) 38 | { 39 | const int LaunchesSD = SD / blockDim.x + (SD % blockDim.x == 0 ? 
0 : 1); 40 | #pragma unroll 41 | for (int j=0; j(\ 149 | tid, \ 150 | NT, \ 151 | r_DenseOutputIndex, \ 152 | r_ActualTime, \ 153 | r_TimeStep, \ 154 | r_TimeDomain, \ 155 | r_ActualState, \ 156 | r_ControlParameters, \ 157 | gs_SharedParameters, \ 158 | gs_IntegerSharedParameters, \ 159 | r_Accessories, \ 160 | r_IntegerAccessories); 161 | 162 | if ( NE > 0 ) 163 | { 164 | PerThread_EventFunction(\ 165 | tid, \ 166 | NT, \ 167 | r_ActualEventValue, \ 168 | r_ActualTime, \ 169 | r_TimeStep, \ 170 | r_TimeDomain, \ 171 | r_ActualState, \ 172 | r_ControlParameters, \ 173 | gs_SharedParameters, \ 174 | gs_IntegerSharedParameters, \ 175 | r_Accessories, \ 176 | r_IntegerAccessories); 177 | } 178 | 179 | if ( NDO > 0 ) 180 | { 181 | PerThread_StoreDenseOutput(\ 182 | tid, \ 183 | r_UpdateDenseOutput, \ 184 | r_DenseOutputIndex, \ 185 | GlobalVariables.d_DenseOutputTimeInstances, \ 186 | r_ActualTime, \ 187 | GlobalVariables.d_DenseOutputStates, \ 188 | r_ActualState, \ 189 | r_NumberOfSkippedStores, \ 190 | r_DenseOutputActualTime, \ 191 | SolverOptions.DenseOutputMinimumTimeStep, \ 192 | r_TimeDomain[1]); 193 | } 194 | 195 | 196 | // SOLVER MANAGEMENT -------------------------------------------------- 197 | while ( r_TerminateSimulation == 0 ) 198 | { 199 | // INITIALISE TIME STEPPING --------------------------------------- 200 | r_UpdateStep = 1; 201 | r_IsFinite = 1; 202 | r_EndTimeDomainReached = 0; 203 | 204 | r_TimeStep = r_NewTimeStep; 205 | 206 | if ( r_TimeStep > ( r_TimeDomain[1] - r_ActualTime ) ) 207 | { 208 | r_TimeStep = r_TimeDomain[1] - r_ActualTime; 209 | r_EndTimeDomainReached = 1; 210 | } 211 | 212 | 213 | // STEPPER -------------------------------------------------------- 214 | if ( Algorithm == RK4 ) 215 | { 216 | PerThread_Stepper_RK4(\ 217 | tid, \ 218 | r_ActualTime, \ 219 | r_TimeStep, \ 220 | r_ActualState, \ 221 | r_NextState, \ 222 | r_Error, \ 223 | r_IsFinite, \ 224 | r_ControlParameters, \ 225 | gs_SharedParameters, \ 226 | 
gs_IntegerSharedParameters, \ 227 | r_Accessories, \ 228 | r_IntegerAccessories); 229 | 230 | PerThread_ErrorController_RK4(\ 231 | tid, \ 232 | SolverOptions.InitialTimeStep, \ 233 | r_IsFinite, \ 234 | r_TerminateSimulation, \ 235 | r_NewTimeStep); 236 | } 237 | 238 | if ( Algorithm == RKCK45 ) 239 | { 240 | PerThread_Stepper_RKCK45(\ 241 | tid, \ 242 | r_ActualTime, \ 243 | r_TimeStep, \ 244 | r_ActualState, \ 245 | r_NextState, \ 246 | r_Error, \ 247 | r_IsFinite, \ 248 | r_ControlParameters, \ 249 | gs_SharedParameters, \ 250 | gs_IntegerSharedParameters, \ 251 | r_Accessories, \ 252 | r_IntegerAccessories); 253 | 254 | PerThread_ErrorController_RKCK45(\ 255 | tid, \ 256 | r_TimeStep, \ 257 | r_ActualState, \ 258 | r_NextState, \ 259 | r_Error, \ 260 | s_RelativeTolerance, \ 261 | s_AbsoluteTolerance, \ 262 | r_UpdateStep, \ 263 | r_IsFinite, \ 264 | r_TerminateSimulation, \ 265 | r_NewTimeStep, \ 266 | SolverOptions); 267 | } 268 | 269 | 270 | // NEW EVENT VALUE AND TIME STEP CONTROL--------------------------- 271 | if ( NE > 0 ) 272 | { 273 | PerThread_EventFunction(\ 274 | tid, \ 275 | NT, \ 276 | r_NextEventValue, \ 277 | r_ActualTime+r_TimeStep, \ 278 | r_TimeStep, \ 279 | r_TimeDomain, \ 280 | r_NextState, \ 281 | r_ControlParameters, \ 282 | gs_SharedParameters, \ 283 | gs_IntegerSharedParameters, \ 284 | r_Accessories, \ 285 | r_IntegerAccessories); 286 | 287 | PerThread_EventTimeStepControl(\ 288 | tid, \ 289 | r_UpdateStep, \ 290 | r_TerminateSimulation, \ 291 | r_ActualEventValue, \ 292 | r_NextEventValue, \ 293 | s_EventTolerance, \ 294 | s_EventDirection, \ 295 | r_TimeStep, \ 296 | r_NewTimeStep, \ 297 | SolverOptions.MinimumTimeStep); 298 | } 299 | 300 | 301 | // UPDATE PROCESS ------------------------------------------------- 302 | if ( r_UpdateStep == 1 ) 303 | { 304 | r_ActualTime += r_TimeStep; 305 | 306 | for (int i=0; i(\ 310 | tid, \ 311 | NT, \ 312 | r_UserDefinedTermination, \ 313 | r_ActualTime, \ 314 | r_TimeStep, \ 315 | 
r_TimeDomain, \ 316 | r_ActualState, \ 317 | r_ControlParameters, \ 318 | gs_SharedParameters, \ 319 | gs_IntegerSharedParameters, \ 320 | r_Accessories, \ 321 | r_IntegerAccessories); 322 | 323 | if ( NE > 0 ) 324 | { 325 | for (int i=0; i s_EventTolerance[i] ) && ( abs(r_NextEventValue[i]) < s_EventTolerance[i] ) && ( s_EventDirection[i] <= 0 ) ) || \ 328 | ( ( r_ActualEventValue[i] < -s_EventTolerance[i] ) && ( abs(r_NextEventValue[i]) < s_EventTolerance[i] ) && ( s_EventDirection[i] >= 0 ) ) ) 329 | { 330 | PerThread_ActionAfterEventDetection(\ 331 | tid, \ 332 | NT, \ 333 | i, \ 334 | r_UserDefinedTermination, \ 335 | r_ActualTime, \ 336 | r_TimeStep, \ 337 | r_TimeDomain, \ 338 | r_ActualState, \ 339 | r_ControlParameters, \ 340 | gs_SharedParameters, \ 341 | gs_IntegerSharedParameters, \ 342 | r_Accessories, \ 343 | r_IntegerAccessories); 344 | } 345 | } 346 | 347 | PerThread_EventFunction(\ 348 | tid, \ 349 | NT, \ 350 | r_NextEventValue, \ 351 | r_ActualTime, \ 352 | r_TimeStep, \ 353 | r_TimeDomain, \ 354 | r_ActualState, \ 355 | r_ControlParameters, \ 356 | gs_SharedParameters, \ 357 | gs_IntegerSharedParameters, \ 358 | r_Accessories, 359 | r_IntegerAccessories); 360 | 361 | for (int i=0; i 0 ) 366 | { 367 | PerThread_DenseOutputStorageCondition(\ 368 | r_ActualTime, \ 369 | r_DenseOutputActualTime, \ 370 | r_DenseOutputIndex, \ 371 | r_NumberOfSkippedStores, \ 372 | r_EndTimeDomainReached, \ 373 | r_UserDefinedTermination, \ 374 | r_UpdateDenseOutput, \ 375 | SolverOptions); 376 | 377 | PerThread_StoreDenseOutput(\ 378 | tid, \ 379 | r_UpdateDenseOutput, \ 380 | r_DenseOutputIndex, \ 381 | GlobalVariables.d_DenseOutputTimeInstances, \ 382 | r_ActualTime, \ 383 | GlobalVariables.d_DenseOutputStates, \ 384 | r_ActualState, \ 385 | r_NumberOfSkippedStores, \ 386 | r_DenseOutputActualTime, \ 387 | SolverOptions.DenseOutputMinimumTimeStep, \ 388 | r_TimeDomain[1]); 389 | } 390 | 391 | if ( ( r_EndTimeDomainReached == 1 ) || ( r_UserDefinedTermination == 1 ) 
) 392 | r_TerminateSimulation = 1; 393 | } 394 | } 395 | 396 | 397 | // FINALISATION ----------------------------------------------------------- 398 | PerThread_Finalization(\ 399 | tid, \ 400 | NT, \ 401 | r_DenseOutputIndex, \ 402 | r_ActualTime, \ 403 | r_TimeStep, \ 404 | r_TimeDomain, \ 405 | r_ActualState, \ 406 | r_ControlParameters, \ 407 | gs_SharedParameters, \ 408 | gs_IntegerSharedParameters, \ 409 | r_Accessories, \ 410 | r_IntegerAccessories); 411 | 412 | 413 | // WRITE DATA BACK TO GLOBAL MEMORY --------------------------------------- 414 | #pragma unroll 415 | for (int i=0; i<2; i++) 416 | GlobalVariables.d_TimeDomain[tid + i*NT] = r_TimeDomain[i]; 417 | 418 | #pragma unroll 419 | for (int i=0; i 7 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_StoreDenseOutput(\ 8 | int* s_UpdateStep, \ 9 | int* s_UpdateDenseOutput, \ 10 | int* s_DenseOutputIndex, \ 11 | int* s_NumberOfSkippedStores, \ 12 | Precision* d_DenseOutputTimeInstances, \ 13 | Precision* d_DenseOutputStates, \ 14 | Precision* s_DenseOutputActualTime, \ 15 | Precision* s_ActualTime, \ 16 | Precision r_ActualState[NBL][UD], \ 17 | Precision s_TimeDomain[SPB][2], \ 18 | Struct_ThreadConfiguration ThreadConfiguration, \ 19 | Struct_SolverOptions SolverOptions) 20 | { 21 | int LocalThreadID_GPU = threadIdx.x; 22 | int BlockID = blockIdx.x; 23 | int GlobalThreadID_Logical; 24 | int LocalThreadID_Logical; 25 | int LocalSystemID; 26 | int GlobalSystemID; 27 | int GlobalMemoryID; 28 | int UnitID; 29 | 30 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 31 | 32 | for (int BL=0; BL 82 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_MultipleBlockLaunches_DenseOutputStorageCondition(\ 83 | int* s_EndTimeDomainReached, \ 84 | int* s_UserDefinedTermination, \ 85 | int* s_UpdateStep, \ 86 | int* s_UpdateDenseOutput, \ 87 | int* s_DenseOutputIndex, \ 88 | int* s_NumberOfSkippedStores, \ 89 | Precision* 
s_DenseOutputActualTime, \ 90 | Precision* s_ActualTime, \ 91 | Struct_SolverOptions SolverOptions) 92 | { 93 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 94 | for (int j=0; j= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 103 | s_UpdateDenseOutput[lsid] = 1; 104 | else 105 | s_UpdateDenseOutput[lsid] = 0; 106 | 107 | if ( ( s_DenseOutputIndex[lsid] < NDO ) && ( ( s_EndTimeDomainReached[lsid] == 1 ) || ( s_UserDefinedTermination[lsid] == 1 ) ) ) 108 | s_UpdateDenseOutput[lsid] = 1; 109 | } else 110 | s_UpdateDenseOutput[lsid] = 0; 111 | } 112 | } 113 | __syncthreads(); 114 | } 115 | 116 | 117 | // SSMBL ---------------------------------------------------------------------- 118 | template 119 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_StoreDenseOutput(\ 120 | int s_UpdateStep, \ 121 | int s_UpdateDenseOutput, \ 122 | int& s_DenseOutputIndex, \ 123 | int& s_NumberOfSkippedStores, \ 124 | Precision* d_DenseOutputTimeInstances, \ 125 | Precision* d_DenseOutputStates, \ 126 | Precision& s_DenseOutputActualTime, \ 127 | Precision s_ActualTime, \ 128 | Precision r_ActualState[NBL][UD], \ 129 | Precision s_TimeDomain[2], \ 130 | Struct_ThreadConfiguration ThreadConfiguration, \ 131 | Struct_SolverOptions SolverOptions) 132 | { 133 | int LocalThreadID_GPU = threadIdx.x; 134 | int GlobalSystemID = blockIdx.x; 135 | int GlobalThreadID_Logical; 136 | int UnitID; 137 | int GlobalMemoryID; 138 | 139 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 140 | 141 | for (int BL=0; BL 182 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_MultipleBlockLaunches_DenseOutputStorageCondition(\ 183 | int s_EndTimeDomainReached, \ 184 | int s_UserDefinedTermination, \ 185 | int s_UpdateStep, \ 186 | int& s_UpdateDenseOutput, \ 187 | int s_DenseOutputIndex, \ 188 | int s_NumberOfSkippedStores, \ 189 | Precision s_DenseOutputActualTime, \ 190 | Precision s_ActualTime, \ 191 | 
Struct_SolverOptions SolverOptions) 192 | { 193 | if ( threadIdx.x == 0 ) 194 | { 195 | if ( s_UpdateStep == 1 ) 196 | { 197 | if ( ( s_DenseOutputIndex < NDO ) && ( s_DenseOutputActualTime < s_ActualTime ) && ( s_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 198 | s_UpdateDenseOutput = 1; 199 | else 200 | s_UpdateDenseOutput = 0; 201 | 202 | if ( ( s_DenseOutputIndex < NDO ) && ( ( s_EndTimeDomainReached == 1 ) || ( s_UserDefinedTermination == 1 ) ) ) 203 | s_UpdateDenseOutput = 1; 204 | } else 205 | s_UpdateDenseOutput = 0; 206 | } 207 | __syncthreads(); 208 | } 209 | 210 | 211 | // MSSBL ---------------------------------------------------------------------- 212 | template 213 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_StoreDenseOutput(\ 214 | const int BlockID, \ 215 | const int LocalSystemID, \ 216 | const int UnitID, \ 217 | const int GlobalSystemID, \ 218 | int* s_UpdateStep, \ 219 | int* s_UpdateDenseOutput, \ 220 | int* s_DenseOutputIndex, \ 221 | int* s_NumberOfSkippedStores, \ 222 | Precision* d_DenseOutputTimeInstances, \ 223 | Precision* d_DenseOutputStates, \ 224 | Precision* s_DenseOutputActualTime, \ 225 | Precision* s_ActualTime, \ 226 | Precision r_ActualState[UD], \ 227 | Precision s_TimeDomain[SPB][2], \ 228 | Struct_ThreadConfiguration ThreadConfiguration, \ 229 | Struct_SolverOptions SolverOptions) 230 | { 231 | int GlobalThreadID_Logical; 232 | int GlobalMemoryID; 233 | 234 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 235 | 236 | if ( ( LocalSystemID < SPB ) && ( s_UpdateDenseOutput[LocalSystemID] == 1 ) ) 237 | { 238 | for (int i=0; i 277 | __forceinline__ __device__ void CoupledSystems_PerBlock_MultipleSystems_SingleBlockLaunch_DenseOutputStorageCondition(\ 278 | int* s_EndTimeDomainReached, \ 279 | int* s_UserDefinedTermination, \ 280 | int* s_UpdateStep, \ 281 | int* s_UpdateDenseOutput, \ 282 | int* s_DenseOutputIndex, \ 283 | int* 
s_NumberOfSkippedStores, \ 284 | Precision* s_DenseOutputActualTime, \ 285 | Precision* s_ActualTime, \ 286 | Struct_SolverOptions SolverOptions) 287 | { 288 | int Launches = SPB / blockDim.x + (SPB % blockDim.x == 0 ? 0 : 1); 289 | for (int j=0; j= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 298 | s_UpdateDenseOutput[lsid] = 1; 299 | else 300 | s_UpdateDenseOutput[lsid] = 0; 301 | 302 | if ( ( s_DenseOutputIndex[lsid] < NDO ) && ( ( s_EndTimeDomainReached[lsid] == 1 ) || ( s_UserDefinedTermination[lsid] == 1 ) ) ) 303 | s_UpdateDenseOutput[lsid] = 1; 304 | } else 305 | s_UpdateDenseOutput[lsid] = 0; 306 | } 307 | } 308 | __syncthreads(); 309 | } 310 | 311 | 312 | // SSSBL ---------------------------------------------------------------------- 313 | template 314 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_StoreDenseOutput(\ 315 | const int LocalThreadID, \ 316 | const int GlobalThreadID, \ 317 | const int GlobalSystemID, \ 318 | int s_UpdateStep, \ 319 | int s_UpdateDenseOutput, \ 320 | int& s_DenseOutputIndex, \ 321 | int& s_NumberOfSkippedStores, \ 322 | Precision* d_DenseOutputTimeInstances, \ 323 | Precision* d_DenseOutputStates, \ 324 | Precision& s_DenseOutputActualTime, \ 325 | Precision s_ActualTime, \ 326 | Precision r_ActualState[UD], \ 327 | Precision s_TimeDomain[2], \ 328 | Struct_ThreadConfiguration ThreadConfiguration, \ 329 | Struct_SolverOptions SolverOptions) 330 | { 331 | int GlobalMemoryID; 332 | 333 | int SizeOfActualState = ThreadConfiguration.TotalLogicalThreads*UD; 334 | 335 | if ( ( LocalThreadID < UPS ) && ( s_UpdateDenseOutput == 1 ) ) 336 | { 337 | for (int i=0; i 369 | __forceinline__ __device__ void CoupledSystems_PerBlock_SingleSystem_SingleBlockLaunch_DenseOutputStorageCondition(\ 370 | int s_EndTimeDomainReached, \ 371 | int s_UserDefinedTermination, \ 372 | int s_UpdateStep, \ 373 | int& s_UpdateDenseOutput, \ 374 | int s_DenseOutputIndex, \ 375 | int s_NumberOfSkippedStores, \ 376 | 
Precision s_DenseOutputActualTime, \ 377 | Precision s_ActualTime, \ 378 | Struct_SolverOptions SolverOptions) 379 | { 380 | if ( threadIdx.x == 0 ) 381 | { 382 | if ( s_UpdateStep == 1 ) 383 | { 384 | if ( ( s_DenseOutputIndex < NDO ) && ( s_DenseOutputActualTime < s_ActualTime ) && ( s_NumberOfSkippedStores >= (SolverOptions.DenseOutputSaveFrequency-1) ) ) 385 | s_UpdateDenseOutput = 1; 386 | else 387 | s_UpdateDenseOutput = 0; 388 | 389 | if ( ( s_DenseOutputIndex < NDO ) && ( ( s_EndTimeDomainReached == 1 ) || ( s_UserDefinedTermination == 1 ) ) ) 390 | s_UpdateDenseOutput = 1; 391 | } else 392 | s_UpdateDenseOutput = 0; 393 | } 394 | __syncthreads(); 395 | } 396 | 397 | #endif -------------------------------------------------------------------------------- /GPU_ODE_MPGOS/SourceCodes/TMP.cuh: -------------------------------------------------------------------------------- 1 | // SINGLE SYSTEM PERTHREAD 2 | 3 | // Test shared memory 4 | if ( tid == 0 ) 5 | { 6 | for (int i=0; i= SPB ) || ( GlobalSystemID >= NS ) || ( GlobalSystemID >= SolverOptions.ActiveSystems ) ) 90 | LimitReached = 1; 91 | else 92 | LimitReached = 0; 93 | 94 | printf("GlbTID_Log: %d, GlbTID_GPU: %d, BlockID: %d, LocTID_Log: %d, LocTID_GPU: %d, GlbSID: %d, LocSID: %d, UnitID: %d, LIMIT: %d \n", \ 95 | GlobalThreadID_Logical, GlobalThreadID_GPU, BlockID, LocalThreadID_Logical, LocalThreadID_GPU, GlobalSystemID, LocalSystemID, UnitID, LimitReached); 96 | } 97 | 98 | // Testing block scope shared memory variables 99 | if ( ( threadIdx.x == 0 ) ) 100 | { 101 | printf("Block ID: %d, s_TerminatedSystemsPerBlock: %d \n", blockIdx.x, s_TerminatedSystemsPerBlock); 102 | printf("Block ID: %d, s_TSS[0]: %d, s_TSS[1]: %d, s_TSS[2]: %d \n", blockIdx.x, s_TerminateSystemScope[0], s_TerminateSystemScope[1], s_TerminateSystemScope[2]); 103 | } 104 | 105 | // Testing global shared memory variables 106 | if ( ( blockIdx.x == 0 ) && ( threadIdx.x == 0 ) ) 107 | { 108 | for (int i=0; i 0 ) ) 186 | { 187 | int 
idx; 188 | for (int row=0; row 0 ) ) 201 | { 202 | int idx; 203 | for (int col=0; col<(2*CBW+1); col++) 204 | { 205 | idx = col + MemoryShift; 206 | printf("%6.3e ", gs_CouplingMatrix[idx]); 207 | } 208 | printf("\n"); 209 | } 210 | 211 | // Full circular 212 | if ( ( CCI == 1 ) && ( CBW == 0 ) ) 213 | { 214 | int idx; 215 | for (int col=0; col