├── .gitignore
├── .gitmodules
├── Makefile
├── README.md
├── examples
    ├── track_iiwa_pcg.cu
    ├── track_iiwa_qdldl.cu
    └── trajfiles
    │   ├── 0_0_eepos.traj
    │   ├── 0_0_traj.csv
    │   ├── 0_1_traj.csv
    │   ├── 0_2_traj.csv
    │   ├── 0_3_traj.csv
    │   ├── 0_4_traj.csv
    │   ├── 1_0_traj.csv
    │   ├── 1_2_traj.csv
    │   ├── 1_3_traj.csv
    │   ├── 1_4_traj.csv
    │   ├── 2_0_traj.csv
    │   ├── 2_1_traj.csv
    │   ├── 2_3_traj.csv
    │   ├── 2_4_traj.csv
    │   ├── 3_0_traj.csv
    │   ├── 3_1_traj.csv
    │   ├── 3_2_traj.csv
    │   ├── 3_4_traj.csv
    │   ├── 4_0_traj.csv
    │   ├── 4_1_traj.csv
    │   ├── 4_2_traj.csv
    │   └── 4_3_traj.csv
└── include
    ├── common
        ├── dz.cuh
        ├── integrator.cuh
        ├── kkt.cuh
        ├── merit.cuh
        └── settings.cuh
    ├── dynamics
        ├── iiwa
        │   ├── iiwa_eepos_grid.cuh
        │   ├── iiwa_eepos_plant.cuh
        │   ├── iiwa_grid.cuh
        │   └── iiwa_plant.cuh
        └── rbd_plant.cuh
    ├── mpcsim.cuh
    ├── pcg
        ├── linsys_setup.cuh
        └── sqp.cuh
    ├── qdldl
        ├── linsys_setup.cuh
        └── sqp.cuh
    └── utils
        ├── csr.cuh
        ├── experiment.cuh
        └── matrix.cuh


/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.exe
4 | *.o
5 | *.so


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "GBD-PCG"]
 2 | 	path = GBD-PCG
 3 | 	url = git@github.com:A2R-Lab/GBD-PCG.git
 4 | [submodule "qdldl"]
 5 | 	path = qdldl
 6 | 	url = https://github.com/osqp/qdldl.git
 7 | [submodule "GLASS"]
 8 | 	path = GLASS
 9 | 	url = git@github.com:A2R-lab/GLASS.git
10 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile
 2 | 
 3 | # Compiler and compiler flags
 4 | NVCC = nvcc
 5 | CFLAGS = --compiler-options -Wall  -O3 -Iinclude -Iinclude/common -IGLASS  -IGBD-PCG/include  -lqdldl  -Iqdldl/include -Lqdldl/build/out -lcublas
 6 | 
 7 | # Project headers under include/; listed as prerequisites so that editing one triggers a rebuild
 8 | HEADERS = $(wildcard include/*.cuh include/*/*.cuh include/*/*/*.cuh)
 9 | 
10 | # These targets are commands, not files ("examples" is also an existing directory)
11 | .PHONY: examples build_qdldl clean
12 | 
13 | # Build both example executables
14 | examples: examples/pcg.exe examples/qdldl.exe
15 | 
16 | examples/pcg.exe: examples/track_iiwa_pcg.cu $(HEADERS)
17 | 	$(NVCC) $(CFLAGS) examples/track_iiwa_pcg.cu -o examples/pcg.exe
18 | examples/qdldl.exe: examples/track_iiwa_qdldl.cu $(HEADERS)
19 | 	$(NVCC) $(CFLAGS) -DLINSYS_SOLVE=0 examples/track_iiwa_qdldl.cu -o examples/qdldl.exe
20 | 
21 | # Configure and build the qdldl submodule with float values and non-long integers
22 | build_qdldl:
23 | 	cd qdldl && mkdir -p build && cd build && cmake -DQDLDL_FLOAT=true -DQDLDL_LONG=false .. && cmake --build . && cd ../../
24 | 
25 | # Remove the built example executables
26 | clean:
27 | 	rm -f examples/*.exe
28 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MPCGPU
 2 | 
 3 | Numerical experiments and the open-source solver from the paper ["MPCGPU: Real-Time Nonlinear Model Predictive Control through Preconditioned Conjugate Gradient on the GPU"](https://arxiv.org/abs/2309.08079) 
 4 | 
 5 | ### Building and running examples
 6 | 
 7 | ```
 8 | git clone https://github.com/A2R-Lab/MPCGPU
 9 | cd MPCGPU
10 | git submodule update --init --recursive
11 | make build_qdldl
12 | make examples
13 | mkdir -p tmp/results
14 | ```
15 | Either install the qdldl shared library by running ```cd qdldl/build && make install``` or modify the ```LD_LIBRARY_PATH``` environment variable to include the path to ```MPCGPU/qdldl/build/out```.
16 | 
17 | ```
18 | ./examples/pcg.exe
19 | ./examples/qdldl.exe
20 | ```
21 | 
22 | ### Setting parameters
23 | 
24 | You can set a number of parameters in the `include/common/settings.cuh` file. You can also override them by passing
25 | compiler flags, which take precedence over the default values set in that file. Please refer to the `Makefile` for
26 | an example.
27 | 
28 | ### Other solvers and problems
29 | 
30 | You should be able to replace the underlying linear system solver with your own solver. Please refer to `include/qdldl/sqp.cuh` for an example.
31 | 
32 | You should also be able to compile and run it for a problem other than the "Kuka IIWA manipulator". Please refer to the `include/dynamics/` folder for an example. We use [GRiD](https://github.com/robot-acceleration/GRiD) for computing rigid body dynamics with analytical gradients.
33 | 
34 | ### Citing
35 | To cite this work in your research, please use the following bibtex:
36 | ```
37 | @inproceedings{adabag2024mpcgpu,
38 |   title={MPCGPU: Real-Time Nonlinear Model Predictive Control through Preconditioned Conjugate Gradient on the GPU}, 
39 |   author={Emre Adabag and Miloni Atal and William Gerard and Brian Plancher},
40 |   booktitle={IEEE International Conference on Robotics and Automation (ICRA)},
41 |   address = {Yokohama, Japan},
42 |   month={May.},
43 |   year = {2024}
44 | }
45 | ```
46 | 


--------------------------------------------------------------------------------
/examples/track_iiwa_pcg.cu:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <vector>
  3 | #include <sstream>
  4 | #include <iostream>
  5 | #include <tuple>
  6 | #include <filesystem>
  7 | #include "mpcsim.cuh"
  8 | #include "dynamics/rbd_plant.cuh"
  9 | #include "settings.cuh"
 10 | #include "utils/experiment.cuh"
 11 | #include "gpu_pcg.cuh"
 12 | 
 13 | 
 14 | // Tracking benchmark for the PCG solver: for each PCG exit tolerance, runs
 15 | // simulateMPC TEST_ITERS times on a recorded trajectory, prints summary stats and
 16 | // writes them to a CSV. Returns 0 on success, 1 if that CSV cannot be opened.
 17 | int main(){
 18 |     constexpr uint32_t state_size = grid::NUM_JOINTS*2;
 19 |     constexpr uint32_t control_size = grid::NUM_JOINTS;
 20 |     constexpr uint32_t knot_points = KNOT_POINTS;
 21 |     const linsys_t timestep = .015625;
 22 | 
 23 |     const uint32_t traj_test_iters = TEST_ITERS;
 24 | 
 25 |     // checks GPU space for pcg
 26 |     checkPcgOccupancy<linsys_t>((void *) pcg<linsys_t, state_size, knot_points>, PCG_NUM_THREADS, state_size, knot_points);
 27 | 
 28 |     print_test_config();
 29 |     // where to store test results — manually create this directory
 30 |     std::string output_directory_path = "tmp/results/";
 31 | 
 32 |     const uint32_t recorded_states = 5;
 33 |     const uint32_t start_goal_combinations = recorded_states*recorded_states;
 34 | 
 35 |     char eePos_traj_file_name[100];
 36 |     char xu_traj_file_name[100];
 37 | 
 38 |     int start_state, goal_state;
 39 |     linsys_t *d_eePos_traj, *d_xu_traj, *d_xs;
 40 | 
 41 |     // PCG exit tolerances to sweep. They depend only on knot_points, so the table is built
 42 |     // once; its size is a compile-time constant (runtime-sized stack arrays are not standard C++).
 43 |     // NOTE(review): for 32 and 64 knot points entries 0 and 2 are equal, so that
 44 |     // tolerance runs twice and rewrites the same overall_stats CSV — confirm intended.
 45 |     constexpr uint32_t num_exit_vals = 5;
 46 |     float pcg_exit_vals[num_exit_vals];
 47 |     if(knot_points==32){
 48 |         pcg_exit_vals[0] = 5e-6;
 49 |         pcg_exit_vals[1] = 7.5e-6;
 50 |         pcg_exit_vals[2] = 5e-6;
 51 |         pcg_exit_vals[3] = 2.5e-6;
 52 |         pcg_exit_vals[4] = 1e-6;
 53 |     }
 54 |     else if(knot_points==64){
 55 |         pcg_exit_vals[0] = 5e-5;
 56 |         pcg_exit_vals[1] = 7.5e-5;
 57 |         pcg_exit_vals[2] = 5e-5;
 58 |         pcg_exit_vals[3] = 2.5e-5;
 59 |         pcg_exit_vals[4] = 1e-5;
 60 |     }
 61 |     else{
 62 |         pcg_exit_vals[0] = 1e-5;
 63 |         pcg_exit_vals[1] = 5e-5;
 64 |         pcg_exit_vals[2] = 1e-4;
 65 |         pcg_exit_vals[3] = 5e-4;
 66 |         pcg_exit_vals[4] = 1e-3;
 67 |     }
 68 | 
 69 |     for(uint32_t ind = 0; ind < start_goal_combinations; ind++){
 70 |         start_state = ind % recorded_states;
 71 |         goal_state = ind / recorded_states;
 72 |         // skip start == goal pairs other than 0 -> 0
 73 |         if(start_state == goal_state && start_state != 0){ continue; }
 74 |         std::cout << "start: " << start_state << " goal: " << goal_state << std::endl;
 75 | 
 76 |         for (uint32_t pcg_exit_ind = 0; pcg_exit_ind < num_exit_vals; pcg_exit_ind++){
 77 |             float pcg_exit_tol = pcg_exit_vals[pcg_exit_ind];
 78 |             std::vector<double> linsys_times;
 79 |             std::vector<uint32_t> sqp_iters;
 80 |             std::vector<toplevel_return_type> current_results;
 81 |             std::vector<float> tracking_errs;
 82 |             std::vector<float> cur_tracking_errs;
 83 |             double tot_final_tracking_err = 0;
 84 | 
 85 |             std::string test_output_prefix = output_directory_path + std::to_string(KNOT_POINTS) + "_" + ( (LINSYS_SOLVE == 1) ? "PCG" : "QDLDL") + "_" + std::to_string(pcg_exit_tol);
 86 |             printf("Logging test results to files with prefix %s \n", test_output_prefix.c_str());
 87 | 
 88 |             for (uint32_t single_traj_test_iter = 0; single_traj_test_iter < traj_test_iters; single_traj_test_iter++){
 89 |                 // read in traj
 90 |                 snprintf(eePos_traj_file_name, sizeof(eePos_traj_file_name), "examples/trajfiles/%d_%d_eepos.traj", start_state, goal_state);
 91 |                 std::vector<std::vector<linsys_t>> eePos_traj2d = readCSVToVecVec<linsys_t>(eePos_traj_file_name);
 92 | 
 93 |                 snprintf(xu_traj_file_name, sizeof(xu_traj_file_name), "examples/trajfiles/%d_%d_traj.csv", start_state, goal_state);
 94 |                 std::vector<std::vector<linsys_t>> xu_traj2d = readCSVToVecVec<linsys_t>(xu_traj_file_name);
 95 | 
 96 |                 if(eePos_traj2d.size() < knot_points){std::cout << "precomputed traj length < knotpoints, not implemented\n"; continue; }
 97 | 
 98 |                 std::vector<linsys_t> h_eePos_traj;
 99 |                 for (const auto& vec : eePos_traj2d) {
100 |                     h_eePos_traj.insert(h_eePos_traj.end(), vec.begin(), vec.end());
101 |                 }
102 |                 std::vector<linsys_t> h_xu_traj;
103 |                 for (const auto& xu_vec : xu_traj2d) {
104 |                     h_xu_traj.insert(h_xu_traj.end(), xu_vec.begin(), xu_vec.end());
105 |                 }
106 | 
107 |                 // The initial state is copied from the first state_size values of h_xu_traj below;
108 |                 // skip when fewer values were read so that copy cannot read past the host buffer.
109 |                 if(h_xu_traj.size() < state_size){std::cout << "precomputed xu traj shorter than one state, skipping\n"; continue; }
110 | 
111 |                 gpuErrchk(cudaMalloc(&d_eePos_traj, h_eePos_traj.size()*sizeof(linsys_t)));
112 |                 gpuErrchk(cudaMemcpy(d_eePos_traj, h_eePos_traj.data(), h_eePos_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice));
113 | 
114 |                 gpuErrchk(cudaMalloc(&d_xu_traj, h_xu_traj.size()*sizeof(linsys_t)));
115 |                 gpuErrchk(cudaMemcpy(d_xu_traj, h_xu_traj.data(), h_xu_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice));
116 | 
117 |                 // d_xs starts as the first state_size entries of the flattened xu trajectory
118 |                 gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(linsys_t)));
119 |                 gpuErrchk(cudaMemcpy(d_xs, h_xu_traj.data(), state_size*sizeof(linsys_t), cudaMemcpyHostToDevice));
120 | 
121 |                 std::tuple<std::vector<toplevel_return_type>, std::vector<linsys_t>, linsys_t> trackingstats = simulateMPC<linsys_t, toplevel_return_type>(state_size, control_size, knot_points,
122 |                     static_cast<uint32_t>(eePos_traj2d.size()), timestep, d_eePos_traj, d_xu_traj, d_xs, start_state, goal_state, single_traj_test_iter, pcg_exit_tol, test_output_prefix);
123 | 
124 |                 current_results = std::get<0>(trackingstats);
125 |                 // results are accumulated as linear-system times or SQP iteration counts, depending on TIME_LINSYS
126 |                 if (TIME_LINSYS == 1) {
127 |                     linsys_times.insert(linsys_times.end(), current_results.begin(), current_results.end());
128 |                 } else {
129 |                     sqp_iters.insert(sqp_iters.end(), current_results.begin(), current_results.end());
130 |                 }
131 | 
132 |                 cur_tracking_errs = std::get<1>(trackingstats);
133 |                 tracking_errs.insert(tracking_errs.end(), cur_tracking_errs.begin(), cur_tracking_errs.end());
134 | 
135 |                 tot_final_tracking_err += std::get<2>(trackingstats);
136 | 
137 |                 gpuErrchk(cudaFree(d_xu_traj));
138 |                 gpuErrchk(cudaFree(d_eePos_traj));
139 |                 gpuErrchk(cudaFree(d_xs));
140 |                 gpuErrchk(cudaPeekAtLastError());
141 |             }
142 | 
143 |             std::cout << "Completed at " << getCurrentTimestamp() << std::endl;
144 |             std::cout << "\nRESULTS*************************************\n";
145 |             std::cout << "Exit tol: " << pcg_exit_tol << std::endl;
146 |             std::cout << "\nTracking err";
147 |             std::string trackingStats = printStats<float>(&tracking_errs, "trackingerr");
148 |             std::cout << "Average final tracking err: " << tot_final_tracking_err / traj_test_iters << std::endl;
149 |             std::string linsysOrSqpStats;
150 |             if (TIME_LINSYS == 1)
151 |             {
152 |                 std::cout << "\nLinsys times";
153 |                 linsysOrSqpStats = printStats<double>(&linsys_times, "linsystimes");
154 |             }
155 |             else
156 |             {
157 |                 std::cout << "\nSqp iters";
158 |                 linsysOrSqpStats = printStats<uint32_t>(&sqp_iters, "sqpiters");
159 |             }
160 |             std::cout << "************************************************\n\n";
161 | 
162 |             const std::string csvFilePath = test_output_prefix + "_" + "overall_stats.csv";
163 | 
164 |             // Open the CSV file for writing
165 |             std::ofstream csvFile(csvFilePath);
166 |             if (!csvFile.is_open()) {
167 |                 std::cerr << "Error opening CSV file for writing." << std::endl;
168 |                 return 1;
169 |             }
170 | 
171 |             // Write the header row
172 |             csvFile << "Average,Std Dev, Min, Max, Median, Q1, Q3\n";
173 | 
174 |             // Write the data rows
175 |             csvFile << getStatsString(trackingStats) << "\n";
176 |             csvFile << getStatsString(linsysOrSqpStats) << "\n";
177 | 
178 |             csvFile.close();
179 |         }
180 |         // only the first start/goal combination is run
181 |         break;
182 |     }
183 |     return 0;
184 | }
185 | 


--------------------------------------------------------------------------------
/examples/track_iiwa_qdldl.cu:
--------------------------------------------------------------------------------
  1 | #include <fstream>
  2 | #include <vector>
  3 | #include <sstream>
  4 | #include <iostream>
  5 | #include <tuple>
  6 | #include <filesystem>
  7 | #include "qdldl.h"
  8 | #include "mpcsim.cuh"
  9 | #include "dynamics/rbd_plant.cuh"
 10 | #include "settings.cuh"
 11 | #include "utils/experiment.cuh"
 12 | 
 13 | // Tracking benchmark for the QDLDL solver: runs simulateMPC TEST_ITERS times on a
 14 | // recorded trajectory, prints summary stats and writes them to a CSV.
 15 | // Returns 0 on success, 1 if that CSV cannot be opened; exits with 1 on a float-type mismatch.
 16 | int main(){
 17 |     constexpr uint32_t state_size = grid::NUM_JOINTS*2;
 18 |     constexpr uint32_t control_size = grid::NUM_JOINTS;
 19 |     constexpr uint32_t knot_points = KNOT_POINTS;
 20 |     const linsys_t timestep = .015625;
 21 | 
 22 |     const uint32_t traj_test_iters = TEST_ITERS;
 23 | 
 24 |     // QDLDL_float and linsys_t must be the same type
 25 |     if(!std::is_same<QDLDL_float, linsys_t>::value){ std::cout << "GBD-PCG QDLDL type mismatch" << std::endl; exit(1); }
 26 | 
 27 |     print_test_config();
 28 |     // where to store test results — manually create this directory
 29 |     std::string output_directory_path = "tmp/results/";
 30 | 
 31 |     const uint32_t recorded_states = 5;
 32 |     const uint32_t start_goal_combinations = recorded_states*recorded_states;
 33 | 
 34 |     char eePos_traj_file_name[100];
 35 |     char xu_traj_file_name[100];
 36 | 
 37 |     int start_state, goal_state;
 38 |     linsys_t *d_eePos_traj, *d_xu_traj, *d_xs;
 39 | 
 40 |     for(uint32_t ind = 0; ind < start_goal_combinations; ind++){
 41 |         start_state = ind % recorded_states;
 42 |         goal_state = ind / recorded_states;
 43 |         // skip start == goal pairs other than 0 -> 0
 44 |         if(start_state == goal_state && start_state != 0){ continue; }
 45 |         std::cout << "start: " << start_state << " goal: " << goal_state << std::endl;
 46 | 
 47 |         // presumably a placeholder: the value is only forwarded to simulateMPC and printed below
 48 |         float linsys_exit_tol = -1;
 49 |         std::vector<double> linsys_times;
 50 |         std::vector<uint32_t> sqp_iters;
 51 |         std::vector<toplevel_return_type> current_results;
 52 |         std::vector<float> tracking_errs;
 53 |         std::vector<float> cur_tracking_errs;
 54 |         double tot_final_tracking_err = 0;
 55 | 
 56 |         std::string test_output_prefix = output_directory_path  + std::to_string(KNOT_POINTS) + "_" + ( (LINSYS_SOLVE == 1) ? "PCG" : "QDLDL");
 57 |         printf("Logging test results to files with prefix %s \n", test_output_prefix.c_str());
 58 | 
 59 |         for (uint32_t single_traj_test_iter = 0; single_traj_test_iter < traj_test_iters; single_traj_test_iter++){
 60 |             // read in traj
 61 |             snprintf(eePos_traj_file_name, sizeof(eePos_traj_file_name), "examples/trajfiles/%d_%d_eepos.traj", start_state, goal_state);
 62 |             std::vector<std::vector<linsys_t>> eePos_traj2d = readCSVToVecVec<linsys_t>(eePos_traj_file_name);
 63 | 
 64 |             snprintf(xu_traj_file_name, sizeof(xu_traj_file_name), "examples/trajfiles/%d_%d_traj.csv", start_state, goal_state);
 65 |             std::vector<std::vector<linsys_t>> xu_traj2d = readCSVToVecVec<linsys_t>(xu_traj_file_name);
 66 | 
 67 |             if(eePos_traj2d.size() < knot_points){std::cout << "precomputed traj length < knotpoints, not implemented\n"; continue; }
 68 | 
 69 |             std::vector<linsys_t> h_eePos_traj;
 70 |             for (const auto& vec : eePos_traj2d) {
 71 |                 h_eePos_traj.insert(h_eePos_traj.end(), vec.begin(), vec.end());
 72 |             }
 73 |             std::vector<linsys_t> h_xu_traj;
 74 |             for (const auto& xu_vec : xu_traj2d) {
 75 |                 h_xu_traj.insert(h_xu_traj.end(), xu_vec.begin(), xu_vec.end());
 76 |             }
 77 | 
 78 |             // The initial state is copied from the first state_size values of h_xu_traj below;
 79 |             // skip when fewer values were read so that copy cannot read past the host buffer.
 80 |             if(h_xu_traj.size() < state_size){std::cout << "precomputed xu traj shorter than one state, skipping\n"; continue; }
 81 | 
 82 |             gpuErrchk(cudaMalloc(&d_eePos_traj, h_eePos_traj.size()*sizeof(linsys_t)));
 83 |             gpuErrchk(cudaMemcpy(d_eePos_traj, h_eePos_traj.data(), h_eePos_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice));
 84 | 
 85 |             gpuErrchk(cudaMalloc(&d_xu_traj, h_xu_traj.size()*sizeof(linsys_t)));
 86 |             gpuErrchk(cudaMemcpy(d_xu_traj, h_xu_traj.data(), h_xu_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice));
 87 | 
 88 |             // d_xs starts as the first state_size entries of the flattened xu trajectory
 89 |             gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(linsys_t)));
 90 |             gpuErrchk(cudaMemcpy(d_xs, h_xu_traj.data(), state_size*sizeof(linsys_t), cudaMemcpyHostToDevice));
 91 | 
 92 |             std::tuple<std::vector<toplevel_return_type>, std::vector<linsys_t>, linsys_t> trackingstats = simulateMPC<linsys_t, toplevel_return_type>(state_size, control_size, knot_points,
 93 |                 static_cast<uint32_t>(eePos_traj2d.size()), timestep, d_eePos_traj, d_xu_traj, d_xs, start_state, goal_state, single_traj_test_iter, linsys_exit_tol, test_output_prefix);
 94 | 
 95 |             current_results = std::get<0>(trackingstats);
 96 |             // results are accumulated as linear-system times or SQP iteration counts, depending on TIME_LINSYS
 97 |             if (TIME_LINSYS == 1) {
 98 |                 linsys_times.insert(linsys_times.end(), current_results.begin(), current_results.end());
 99 |             } else {
100 |                 sqp_iters.insert(sqp_iters.end(), current_results.begin(), current_results.end());
101 |             }
102 | 
103 |             cur_tracking_errs = std::get<1>(trackingstats);
104 |             tracking_errs.insert(tracking_errs.end(), cur_tracking_errs.begin(), cur_tracking_errs.end());
105 | 
106 |             tot_final_tracking_err += std::get<2>(trackingstats);
107 | 
108 |             gpuErrchk(cudaFree(d_xu_traj));
109 |             gpuErrchk(cudaFree(d_eePos_traj));
110 |             gpuErrchk(cudaFree(d_xs));
111 |             gpuErrchk(cudaPeekAtLastError());
112 |         }
113 | 
114 |         std::cout << "Completed at " << getCurrentTimestamp() << std::endl;
115 |         std::cout << "\nRESULTS*************************************\n";
116 |         std::cout << "exit tol: " << linsys_exit_tol << std::endl;
117 |         std::cout << "\nTracking err";
118 |         std::string trackingStats = printStats<float>(&tracking_errs, "trackingerr");
119 |         std::cout << "Average final tracking err: " << tot_final_tracking_err / traj_test_iters << std::endl;
120 |         std::string linsysOrSqpStats;
121 |         if (TIME_LINSYS == 1)
122 |         {
123 |             std::cout << "\nLinsys times";
124 |             linsysOrSqpStats = printStats<double>(&linsys_times, "linsystimes");
125 |         }
126 |         else
127 |         {
128 |             std::cout << "\nSqp iters";
129 |             linsysOrSqpStats = printStats<uint32_t>(&sqp_iters, "sqpiters");
130 |         }
131 |         std::cout << "************************************************\n\n";
132 | 
133 |         const std::string csvFilePath = test_output_prefix + "_" + "overall_stats.csv";
134 | 
135 |         // Open the CSV file for writing
136 |         std::ofstream csvFile(csvFilePath);
137 |         if (!csvFile.is_open()) {
138 |             std::cerr << "Error opening CSV file for writing." << std::endl;
139 |             return 1;
140 |         }
141 | 
142 |         // Write the header row
143 |         csvFile << "Average,Std Dev, Min, Max, Median, Q1, Q3\n";
144 | 
145 |         // Write the data rows
146 |         csvFile << getStatsString(trackingStats) << "\n";
147 |         csvFile << getStatsString(linsysOrSqpStats) << "\n";
148 | 
149 |         // Close the CSV file
150 |         csvFile.close();
151 |         // only the first start/goal combination is run
152 |         break;
153 |     }
154 | 
155 |     return 0;
156 | }
157 | 


--------------------------------------------------------------------------------
/examples/trajfiles/3_4_traj.csv:
--------------------------------------------------------------------------------
  1 | 0.2099999934,0.4600000083,-0.8799999952,-0.4099999964,0.2099999934,0.8700000048,1.080000043,0.009999999776,0.01999999955,-0.02999999933,0.03999999911,-0.05000000075,0,0.009999999776,0,0,0,0,0,0,0
  2 | 0.2099999934,0.4600000083,-0.8799999952,-0.4099999964,0.2099999934,0.8700000048,1.080000043,0.009999999776,0.01999999955,-0.02999999933,0.03999999911,-0.05000000075,0,0.009999999776,9.991406441,7.832764149,0.5569088459,-25.84409904,-9.808384895,0.01617191359,-1.959708691
  3 | 0.210156247,0.4603125155,-0.8804687262,-0.4093749821,0.2092187405,0.8700000048,1.080156326,0.9763528109,-0.6563335657,1.896548152,-2.231715202,-9.520463943,-3.523556709,-1.300929427,4.204999924,10.6261673,3.42241478,-15.94774055,1.098550439,1.951040983,-0.02675159648
  4 | 0.2254117578,0.4500572979,-0.8508351445,-0.4442455173,0.06046149135,0.8149444461,1.059829354,1.386060119,-0.8999018669,1.654242396,-3.17975831,-9.153206825,-3.290498018,-1.271032333,1.077785254,12.104496,3.916874886,-9.365999222,1.283721089,0.963589251,-0.04367618263
  5 | 0.2470689416,0.4359963238,-0.8249875903,-0.4939292371,-0.0825573653,0.7635304332,1.039969444,1.397001386,-1.004281044,1.643015862,-3.515737534,-8.717803001,-3.069464445,-1.207003117,0.1252228469,12.27435493,3.402727604,-6.210497379,1.053107738,0.4171015024,-0.05363409594
  6 | 0.2688970864,0.4203044176,-0.7993154526,-0.5488626361,-0.2187730372,0.7155700326,1.021110058,1.28299427,-1.029776931,1.585389137,-3.600118637,-8.291081429,-2.893771172,-1.14661479,-0.04410818592,11.87872505,3.018870592,-4.578218937,0.8713625073,0.1302656382,-0.04291247576
  7 | 0.2889438868,0.4042141438,-0.7745437622,-0.60511446,-0.3483211994,0.6703548431,1.003194213,1.128157377,-1.013815522,1.502522945,-3.573584557,-7.887279034,-2.739961624,-1.090176344,0.1007819548,11.25065517,2.73128438,-3.638435125,0.7391657233,-0.02230995893,-0.02418729849
  8 | 0.3065713346,0.3883732855,-0.7510668635,-0.6609517336,-0.4715599418,0.6275429726,0.9861602187,0.9674246311,-0.9751346707,1.410684347,-3.497129202,-7.504574776,-2.599572182,-1.036983609,0.3587346971,10.5305624,2.498670101,-3.020742893,0.6368814707,-0.1016343087,-0.004605192691
  9 | 0.3216873407,0.3731368184,-0.7290249467,-0.7155943513,-0.5888189077,0.5869246721,0.9699573517,0.8156070709,-0.9234204292,1.317762017,-3.397771358,-7.140995979,-2.468681097,-0.9866023064,0.6383093596,9.782183647,2.301772594,-2.558377743,0.5532415509,-0.1400595605,0.01300768554
 10 | 0.3344312012,0.3587083817,-0.7084349394,-0.7686845064,-0.700396955,0.5483515263,0.9545416832,0.6786612272,-0.8640217781,1.227555513,-3.28790617,-6.795248032,-2.345368624,-0.9387907982,0.897870481,9.034562111,2.131248474,-2.174699783,0.4826232493,-0.1553330123,0.02781313099
 11 | 0.3450352848,0.3452080488,-0.6892544031,-0.8200580478,-0.8065726757,0.5117051601,0.9398730993,0.5584961772,-0.8001363277,1.14192915,-3.173535109,-6.46626091,-2.228666544,-0.8933808208,1.118873596,8.301286697,1.981235147,-1.833651662,0.4219322503,-0.1573725939,0.03973370418
 12 | 0.3537617922,0.332705915,-0.6714117527,-0.8696445227,-0.9076080322,0.4768822491,0.9259140491,0.4550662041,-0.7338258028,1.061833858,-3.057766914,-6.153164387,-2.117984772,-0.850231111,1.29460752,7.589044094,1.847171783,-1.518584013,0.369430244,-0.1518730223,0.04900585487
 13 | 0.3608722091,0.3212398887,-0.654820621,-0.9174221158,-1.003751278,0.4437887371,0.9126291871,0.3673797846,-0.6665077209,0.9877112508,-2.942373753,-5.855163574,-2.012966633,-0.8092082739,1.424676538,6.900812626,1.724993825,-1.22185421,0.3238896728,-0.1419952214,0.05593191087
 14 | 0.3666125238,0.3108257055,-0.6393876076,-0.963396728,-1.095238209,0.4123361409,0.8999853134,0.2940093875,-0.5992016792,0.919688642,-2.828468561,-5.571601391,-1.913300157,-0.7702032328,1.511806488,6.238016605,1.611262918,-0.9400814772,0.2845953703,-0.1296046227,0.06088227779
 15 | 0.3712064326,0.3014631867,-0.6250174642,-1.007591605,-1.182294488,0.3824408352,0.8879508972,0.2333576381,-0.532656312,0.8576809168,-2.716795683,-5.301782608,-1.818749189,-0.733101368,1.560486078,5.601210117,1.503153563,-0.6723706126,0.2509788275,-0.1156345308,0.06419639289
 16 | 0.3748526573,0.2931404412,-0.6116161942,-1.050041556,-1.265134811,0.3540228903,0.8764961958,0.1838198304,-0.4674308896,0.8014618158,-2.607881069,-5.04499054,-1.729033947,-0.6978045106,1.575835466,4.990192413,1.39840889,-0.4183811843,0.2222726792,-0.1007424593,0.06611167639
 17 | 0.3777248561,0.2858368456,-0.5990933776,-1.090789676,-1.343962789,0.3270067275,0.8655930161,0.143859297,-0.4039468169,0.7507185936,-2.502102375,-4.80072403,-1.64397037,-0.6642226577,1.562997341,4.40459156,1.295410156,-0.1784701943,0.198166877,-0.085193187,0.06693752855
 18 | 0.3799726665,0.2795251906,-0.5873634219,-1.129885077,-1.418974161,0.3013196886,0.8552145362,0.1120715067,-0.3425167203,0.705065012,-2.399713516,-4.568335056,-1.563319683,-0.6322647929,1.526815414,3.843975544,1.192992926,0.04716718569,0.1780515462,-0.06919217855,0.06686349213
 19 | 0.3817237914,0.2741733789,-0.576346755,-1.167380571,-1.490354419,0.2768928111,0.8453354239,0.08719525486,-0.2833710611,0.6640833616,-2.300871611,-4.34730196,-1.48684597,-0.6018499732,1.471623421,3.307807446,1.090525031,0.2584060729,0.1615772843,-0.0529362224,0.06611222029
 20 | 0.3830862045,0.2697457075,-0.5659704804,-1.203331709,-1.558281064,0.2536608577,0.8359315395,0.06811897457,-0.2266747952,0.6273411512,-2.205647469,-4.137019157,-1.414343953,-0.5728984475,1.40141201,2.795626402,0.9878099561,0.4547972977,0.148204416,-0.03656005114,0.06481542438
 21 | 0.3841505647,0.2662039101,-0.5561682582,-1.237794995,-1.622921944,0.2315617353,0.8269799948,0.05387430638,-0.1725497097,0.5944211483,-2.114062786,-3.936994553,-1.345598578,-0.5453414917,1.319521785,2.306937933,0.8847957253,0.6364368796,0.1375202388,-0.02020245604,0.0631146729
 22 | 0.3849923611,0.2635078132,-0.546880424,-1.270827174,-1.684437513,0.2105367631,0.818459034,0.04363106191,-0.1210781634,0.5649145246,-2.026085615,-3.746736765,-1.280396581,-0.5191071033,1.228804231,1.8413167,0.7817070484,0.80339396,0.1291450709,-0.004029831383,0.06112636253
 23 | 0.3856740892,0.2616159618,-0.5380536318,-1.302484751,-1.742980242,0.1905305684,0.8103479743,0.03667804599,-0.07231400907,0.5384389162,-1.941648245,-3.565755606,-1.218546629,-0.4941310883,1.131683707,1.398428082,0.6788892746,0.9559422135,0.1227005869,0.01181147899,0.05894094706
 24 | 0.3862471879,0.2604860663,-0.5296404958,-1.332823038,-1.798695207,0.1714907736,0.8026272058,0.03240851313,-0.02628783137,0.5146428347,-1.860655069,-3.393583775,-1.159866929,-0.4703538716,1.030238032,0.9780305624,0.5767514706,1.094190121,0.1177949458,0.02719751,0.05661784485
 25 | 0.3867535591,0.2600753307,-0.5215991735,-1.3618958,-1.851719975,0.1533678472,0.7952779531,0.03031272069,0.01698634401,0.4932011068,-1.783000231,-3.229820967,-1.104172468,-0.4477190077,0.9260895252,0.5797529817,0.4757205248,1.218822241,0.1142327189,0.04200038686,0.0542502813
 26 | 0.3872272074,0.2603407502,-0.513892889,-1.38975513,-1.902185917,0.1361151487,0.7882823348,0.02995982207,0.05751252547,0.4738182127,-1.708558321,-3.073982,-1.051291943,-0.4261698723,0.8207266331,0.2034440339,0.3763141036,1.330058098,0.1115580201,0.05609277263,0.05183074623
 27 | 0.3876953423,0.2612393796,-0.5064894557,-1.416451335,-1.950216889,0.1196887121,0.7816234231,0.03098231927,0.09530344605,0.4562348425,-1.637208343,-2.925774097,-1.001076579,-0.4056552947,0.7152611613,-0.1510895938,0.2789497674,1.428749561,0.1097126305,0.06940529495,0.04944870621
 28 | 0.3881794512,0.2627284825,-0.4993607998,-1.442032695,-1.995932102,0.1040468886,0.7752850652,0.03307596222,0.1303879917,0.4402119517,-1.568817377,-2.784753799,-0.9533701539,-0.3861263692,0.6106904149,-0.4841172099,0.1840743423,1.515300989,0.1084182039,0.08185072988,0.04710962996
 29 | 0.3886962533,0.264765799,-0.4924824834,-1.466545463,-2.03944397,0.08915048093,0.7692518234,0.03598498181,0.1628020108,0.4255379736,-1.503260851,-2.650568724,-0.9080362916,-0.3675367534,0.5078269243,-0.7958704233,0.09210560471,1.590422392,0.1074717715,0.09338294715,0.04482619092
 30 | 0.3892585039,0.2673095763,-0.4858334661,-1.490033865,-2.080859184,0.0749624148,0.7635090351,0.03949214891,0.1925904453,0.4120289683,-1.440416217,-2.522915602,-0.8649411201,-0.349837333,0.4072561562,-1.086704135,0.003345089033,1.654977322,0.1068040133,0.1039702445,0.04263185337
 31 | 0.389875561,0.2703188062,-0.4793955088,-1.51254034,-2.120279789,0.0614477098,0.7580428123,0.04341764003,0.2198082209,0.3995184004,-1.380160332,-2.401439667,-0.823964119,-0.332991004,0.3095520735,-1.356871963,-0.08186154068,1.709528565,0.106266737,0.1135983095,0.04052871838
 32 | 0.3905539513,0.2737533152,-0.4731530249,-1.534105301,-2.157802343,0.04857327044,0.7528398037,0.04761113971,0.2445165366,0.3878610432,-1.322379231,-2.285836458,-0.7849846482,-0.3169545829,0.2151671946,-1.606685758,-0.1632221192,1.754729033,0.1057344228,0.1222458705,0.03851189464
 33 | 0.3912978768,0.2775738835,-0.4670926929,-1.554767489,-2.193518639,0.03630788624,0.747887373,0.05194659531,0.2667831481,0.3769296408,-1.266964555,-2.175850153,-0.7479057908,-0.3016900122,0.1244329885,-1.836529255,-0.2405114174,1.791366577,0.1052514985,0.1299544573,0.03661888093
 34 | 0.3921095431,0.2817423642,-0.4612031579,-1.574563861,-2.227516413,0.02462185919,0.74317348,0.05632156879,0.2866835296,0.3666127026,-1.21381247,-2.071133137,-0.7126061916,-0.2871604562,0.0376669839,-2.046811581,-0.3135189116,1.819991589,0.1046021059,0.1367045343,0.03480178118
 35 | 0.3929895759,0.2862218022,-0.4554748237,-1.593529701,-2.25987792,0.01348738745,0.7386866212,0.06064805388,0.304296881,0.3568146229,-1.162825584,-1.971506596,-0.6790114045,-0.2733300924,-0.04491040483,-2.237961054,-0.3820841908,1.841253757,0.1038670093,0.142543748,0.03309428319
 36 | 0.3939372003,0.2909764349,-0.4498995841,-1.611698866,-2.290682793,0.002877834253,0.7344158292,0.06485706568,0.3197085857,0.3474510908,-1.113912582,-1.876693487,-0.6470288634,-0.2601676881,-0.1231659129,-2.410532713,-0.4461358488,1.855897069,0.103025943,0.147525996,0.03149637952
 37 | 0.3949505985,0.2959718704,-0.4444706738,-1.62910378,-2.320006132,-0.007231991738,0.7303507328,0.06889312714,0.3330090046,0.3384487033,-1.06698513,-1.786428213,-0.6165630817,-0.247637406,-0.196956858,-2.565092087,-0.505572319,1.864428282,0.1020112336,0.1516682506,0.02998733148
 38 | 0.3960270584,0.3011751473,-0.4391824007,-1.645775437,-2.347918987,-0.01686578989,0.7264813781,0.07270992547,0.3442910612,0.3297465444,-1.021960497,-1.700507045,-0.5875444412,-0.2357103527,-0.2662021816,-2.702188492,-0.5603578091,1.867390394,0.1008147448,0.1550169736,0.02856246382
 39 | 0.3971631527,0.3065547049,-0.4340301156,-1.661743522,-2.374489307,-0.02604617178,0.7227984071,0.07627151161,0.353651017,0.3212923706,-0.9787601829,-1.618739486,-0.5599068999,-0.2243593484,-0.3308574855,-2.822480202,-0.6104848981,1.865313768,0.09948141873,0.1576333344,0.02723096125
 40 | 0.398354888,0.3120805025,-0.4290099144,-1.677036643,-2.399782181,-0.03479471803,0.7192928195,0.07955127954,0.3611875176,0.3130425215,-0.9373106956,-1.540904403,-0.5335736871,-0.2135564536,-0.390930146,-2.926653147,-0.6560022831,1.858740449,0.09799384326,0.1595636308,0.02598120831
 41 | 0.3995978832,0.3177240491,-0.424118638,-1.6916821,-2.423858881,-0.04313180596,0.7159559727,0.08252950013,0.3670010865,0.3049601912,-0.8975406289,-1.46681416,-0.5084808469,-0.203272596,-0.446426183,-3.015343428,-0.6969528198,1.848058105,0.09636003524,0.1608557403,0.02480675653
 42 | 0.4008873999,0.3234584332,-0.4193536341,-1.70570612,-2.446777821,-0.05107681826,0.71277982,0.08519287407,0.3711932003,0.2970153093,-0.8593831658,-1.396288991,-0.4845659733,-0.1934860945,-0.4973852336,-3.089286089,-0.7334092855,1.833696604,0.09459446371,0.1615519226,0.02370432205
 43 | 0.4022185504,0.3292583227,-0.4147127569,-1.719133973,-2.46859479,-0.05864816159,0.7097566128,0.08753298223,0.373865664,0.2891843021,-0.8227740526,-1.329158068,-0.4617771506,-0.1841711998,-0.5439296365,-3.149354696,-0.7655391097,1.816161156,0.09272176772,0.1617219299,0.02267135307
 44 | 0.4035862386,0.3350999653,-0.410194248,-1.731989861,-2.489362955,-0.0658634305,0.7068789601,0.08954655379,0.3751199841,0.2814477384,-0.7876508236,-1.265249133,-0.4400544763,-0.1753051877,-0.5860841274,-3.196105003,-0.7934034467,1.795710206,0.09073790163,0.1613986343,0.02169793844
 45 | 0.4049853981,0.3409612179,-0.4057966173,-1.744296908,-2.509132385,-0.07273928076,0.7041398287,0.09123361856,0.3750574291,0.2737919688,-0.7539542317,-1.204413652,-0.4193477631,-0.1668665409,-0.6239601374,-3.230334282,-0.8171652555,1.77268827,0.08867237717,0.160630241,0.02078399248
 46 | 0.4064109325,0.3468214869,-0.4015186131,-1.756077409,-2.527951241,-0.07929158956,0.7015325427,0.09259895235,0.3737780154,0.2662054598,-0.7216275334,-1.14649725,-0.3996084034,-0.1588333845,-0.6576936841,-3.252872944,-0.8369845152,1.747371078,0.08651776612,0.1594576091,0.01991752908
 47 | 0.4078578055,0.3526617587,-0.3973591626,-1.767352819,-2.545865297,-0.08553547412,0.6990507841,0.09364801645,0.3713790774,0.2586829066,-0.6906166673,-1.091372728,-0.3807921708,-0.1511891037,-0.6873921156,-3.264338493,-0.8530156016,1.7200526,0.08432210237,0.1579260975,0.0191057194
 48 | 0.4093210697,0.3584645689,-0.3933172524,-1.778143764,-2.562917948,-0.09148535132,0.6966884732,0.09439046681,0.3679571152,0.2512199581,-0.6608693004,-1.03888905,-0.3628526032,-0.1439122111,-0.7132634521,-3.265720606,-0.8655019999,1.691038251,0.08206364512,0.1560740918,0.01833258756
 49 | 0.410795927,0.3642138839,-0.3893919289,-1.788469791,-2.579150677,-0.09715492278,0.6944398284,0.09483641386,0.3636045456,0.2438155264,-0.6323341131,-0.9889353514,-0.3457535505,-0.1369870156,-0.7354079485,-3.257618666,-0.8745895028,1.660505891,0.07978202403,0.1539425999,0.01760373265
 50 | 0.4122777581,0.3698952198,-0.385582298,-1.798349977,-2.594602823,-0.1025573239,0.6922994256,0.09499890357,0.3584116995,0.2364700139,-0.6049630642,-0.9413807392,-0.3294505775,-0.1303950399,-0.7540143132,-3.240794659,-0.8804967999,1.628680706,0.07749184966,0.1515660137,0.01691614091
 51 | 0.4137621224,0.375495404,-0.3818874657,-1.807802558,-2.609311819,-0.1077049896,0.6902620196,0.09489215165,0.3524655104,0.2291852981,-0.5787097216,-0.8960977793,-0.3139026165,-0.1241199672,-0.7692053914,-3.215848207,-0.8833800554,1.595705152,0.07516720891,0.1489609182,0.01625573821
 52 | 0.4152448177,0.3810026646,-0.3783064485,-1.81684494,-2.623313427,-0.1126097143,0.6883226633,0.09453035146,0.3458498716,0.2219657749,-0.5535284877,-0.8529978991,-0.2990829647,-0.1181475073,-0.7811929584,-3.183618069,-0.8834796548,1.561792135,0.07284149528,0.1461701095,0.01562676579
 53 | 0.4167218506,0.3864065707,-0.374838233,-1.825493813,-2.636641502,-0.1172828823,0.6864765882,0.09392879158,0.3386443257,0.2148166001,-0.5293762684,-0.811976552,-0.284955889,-0.1124631241,-0.790168345,-3.144799471,-0.8810173273,1.527165771,0.07055652142,0.1432292461,0.0150366202
 54 | 0.4181894958,0.3916978836,-0.3714817166,-1.833765268,-2.649328709,-0.1217353195,0.6847193241,0.09310439974,0.3309248686,0.2077422142,-0.5062111616,-0.7729048133,-0.2714823484,-0.1070507988,-0.7962448001,-3.099918842,-0.8761274815,1.491796136,0.06823741645,0.1401369125,0.01445809659
 55 | 0.4196442664,0.3968685865,-0.3682357371,-1.841674805,-2.661405325,-0.1259772331,0.683046639,0.0920720771,0.3227631748,0.2007507682,-0.4839936495,-0.7357316017,-0.2586413324,-0.1019009575,-0.7996664047,-3.049725294,-0.8690789938,1.456003904,0.06598052382,0.1369453669,0.01391632762
 56 | 0.4210828841,0.4019117653,-0.3650990129,-1.849237204,-2.672901154,-0.1300185025,0.6814544201,0.09084951133,0.3142274022,0.1938471049,-0.4626848698,-0.7003312707,-0.2463947833,-0.09699727595,-0.8005658984,-2.994710684,-0.8600105643,1.41974771,0.06372748315,0.1336484998,0.01338967122
 57 | 0.4225023985,0.4068215787,-0.3620701432,-1.856466651,-2.683843851,-0.1338684261,0.6799388528,0.08945222944,0.3053812683,0.1870390773,-0.4422482848,-0.6666388512,-0.234723106,-0.09233058244,-0.7991551161,-2.935563326,-0.849153161,1.383213282,0.06151872501,0.1302839965,0.01288756449
 58 | 0.4239000976,0.411593169,-0.3591476679,-1.863376737,-2.69426012,-0.1375359744,0.678496182,0.08789675683,0.296284169,0.180333063,-0.4226492345,-0.634562254,-0.2235943377,-0.08788838983,-0.7955752611,-2.872694731,-0.8366603851,1.346513033,0.05934936553,0.1268644035,0.01240567211
 59 | 0.425273478,0.4162226021,-0.3563299775,-1.869980574,-2.704175234,-0.141029641,0.6771229506,0.08619936556,0.2869923413,0.1737352312,-0.403853178,-0.6040218472,-0.2129831165,-0.08365878463,-0.7899918556,-2.806655169,-0.8226977587,1.309664249,0.05720510706,0.1233965158,0.01193705201
 60 | 0.4266203344,0.4207068682,-0.3536153734,-1.876290798,-2.713613033,-0.1443575025,0.6758157611,0.08437518775,0.2775573134,0.1672526747,-0.3858281374,-0.5749567747,-0.2028692812,-0.07963341475,-0.7825935483,-2.7379601,-0.8074572682,1.272821307,0.0551183708,0.1199057177,0.01148956735
 61 | 0.4279386997,0.4250437021,-0.3510020375,-1.882319331,-2.722596645,-0.1475273371,0.6745715141,0.08243972808,0.268026948,0.1608909667,-0.368543148,-0.5472840667,-0.1932267696,-0.07580138743,-0.7735576034,-2.667135,-0.7911096215,1.236053228,0.05307078734,0.116399467,0.01105566602
 62 | 0.4292268157,0.4292316139,-0.3484881222,-1.888077855,-2.731148005,-0.150546506,0.6733871102,0.08040711284,0.2584446073,0.1546560228,-0.3519683182,-0.5209433436,-0.1840365678,-0.07215411961,-0.7630058527,-2.594482422,-0.7737811208,1.199434876,0.0510825254,0.112896204,0.01064046752
 63 | 0.4304831624,0.4332697988,-0.3460716307,-1.893577337,-2.739287853,-0.1534220725,0.6722596884,0.07829188555,0.2488508672,0.148552537,-0.3360753357,-0.4958604872,-0.1752726436,-0.06868086755,-0.7510743141,-2.520379066,-0.7555931807,1.162952781,0.04912081361,0.1093886942,0.0102323601
 64 | 0.4317064583,0.4371581078,-0.3437505066,-1.898828506,-2.747035742,-0.1561607122,0.6711865664,0.07610679418,0.2392823547,0.1425861269,-0.3208371997,-0.4719943404,-0.1669227779,-0.0653757453,-0.7379370332,-2.445242167,-0.7367148399,1.126767397,0.04723277315,0.1059076786,0.009844734333
 65 | 0.4328956306,0.4408968985,-0.3415226042,-1.903841615,-2.754410744,-0.1587688774,0.670165062,0.07386527956,0.2297725379,0.1367601305,-0.3062281609,-0.4492661357,-0.1589601487,-0.06222908944,-0.7236613631,-2.369223833,-0.7172117829,1.090846896,0.0453822799,0.1024434716,0.009466071613
 66 | 0.4340497851,0.4444870949,-0.3393857181,-1.908626437,-2.761430502,-0.1612526327,0.6691927314,0.07157991081,0.2203524262,0.1310779154,-0.2922231555,-0.4276319742,-0.1513710022,-0.05923350528,-0.7084555626,-2.292855978,-0.6972549558,1.055290937,0.04358210787,0.09901089966,0.009099178948
 67 | 0.4351682067,0.4479300976,-0.3373376131,-1.913192391,-2.768112183,-0.1636178046,0.6682671905,0.0692614764,0.2110484838,0.1255432069,-0.2787985504,-0.407040894,-0.1441388428,-0.05638191476,-0.692502737,-2.216587543,-0.6770020127,1.020180821,0.04183861986,0.09562198818,0.008745257743
 68 | 0.4362504184,0.4512277246,-0.3353759944,-1.917548656,-2.774472237,-0.1658699811,0.6673862338,0.06691975892,0.2018831968,0.1201589033,-0.2659319937,-0.3874396384,-0.1372458339,-0.05366767198,-0.6758520007,-2.140482187,-0.6565001607,0.9855268598,0.0401469171,0.09227785468,0.008402713574
 69 | 0.4372960329,0.4543821514,-0.3334985077,-1.921703815,-2.780525923,-0.1680144519,0.6665476561,0.06456464529,0.1928775162,0.1149269342,-0.2536018491,-0.368780762,-0.1306763291,-0.05108390749,-0.6586300731,-2.064765692,-0.6358581185,0.9514898062,0.03850866482,0.08899307251,0.008071147837
 70 | 0.4383048415,0.4573958516,-0.3317027688,-1.925666332,-2.786288023,-0.1700562686,0.6657494903,0.06220534444,0.1840503663,0.1098481715,-0.2417849749,-0.3510194421,-0.1244152784,-0.04862380028,-0.6409112215,-1.989667296,-0.6151273847,0.9178655148,0.03692013025,0.08575350791,0.007750111632
 71 | 0.4392768145,0.4602716267,-0.3299863935,-1.929444194,-2.791772604,-0.1720002592,0.6649897695,0.05985035002,0.1754169315,0.1049239114,-0.2304640412,-0.3341118395,-0.1184485778,-0.0462824516,-0.6228089333,-1.915370822,-0.5944007635,0.8848097324,0.03538250923,0.08257258683,0.007439515088
 72 | 0.4402119815,0.4630125165,-0.3283469677,-1.933045149,-2.796993017,-0.1738510132,0.6642665863,0.05750740692,0.1669907272,0.1001545414,-0.2196200341,-0.3180170357,-0.1127626225,-0.04405361041,-0.6044149399,-1.842044592,-0.5737478733,0.8523536921,0.03389493749,0.0794538334,0.007138856687
 73 | 0.4411105216,0.4656217396,-0.3267820477,-1.936476707,-2.801962137,-0.1756129265,0.6635782719,0.05518358201,0.1587831378,0.09553999454,-0.2092347294,-0.3026961982,-0.1073444933,-0.04193189368,-0.5858139992,-1.769841313,-0.5532319546,0.8205248713,0.03245694563,0.07640042156,0.006847959477
 74 | 0.4419727623,0.4681027234,-0.3252892494,-1.939746022,-2.806691647,-0.1772901863,0.6629230976,0.05288530141,0.1508035362,0.09107973427,-0.1992906481,-0.2881121635,-0.102181673,-0.03991211951,-0.5666686893,-1.697759032,-0.5325611234,0.7889673114,0.0310611017,0.07338098437,0.006565776188
 75 | 0.4427990913,0.4704590142,-0.3238661289,-1.942859888,-2.811193466,-0.1788867712,0.6622994542,0.05062165856,0.1430660188,0.08677367121,-0.1897743046,-0.2742264867,-0.09726103395,-0.03799046203,-0.5481389165,-1.628986716,-0.5126609206,0.7584556937,0.0297195632,0.07046501338,0.00629402278
 76 | 0.443590045,0.4726944268,-0.3225103021,-1.9458251,-2.815478325,-0.180406481,0.6617058516,0.04839053005,0.1355638504,0.08262271434,-0.1806692034,-0.2610118687,-0.09257523715,-0.03616001457,-0.5293614864,-1.560881615,-0.4928801954,0.7286660075,0.02842600457,0.0676234439,0.006031180266
 77 | 0.4443461597,0.4748126268,-0.3212193251,-1.948648095,-2.819556713,-0.1818529665,0.6611408591,0.04620121419,0.1283075362,0.07862105221,-0.1719565541,-0.2484323829,-0.08811034262,-0.03441727906,-0.5106551051,-1.494330406,-0.4734219313,0.6995820999,0.02717776224,0.06485396624,0.005776790436
 78 | 0.4450680614,0.4768174291,-0.3199908733,-1.951334953,-2.823438406,-0.1832296848,0.660603106,0.04405746982,0.1212999374,0.07476712763,-0.1636214405,-0.2364581972,-0.08385671675,-0.03275864571,-0.4920838475,-1.429409862,-0.4543305039,0.6712672114,0.0259762872,0.06216234341,0.005531410687
 79 | 0.4457564652,0.4787127376,-0.3188226223,-1.953891516,-2.827133179,-0.1845399439,0.6600912809,0.04196260497,0.1145428047,0.07105877995,-0.1556486189,-0.2250598073,-0.07980436087,-0.03117946722,-0.473682791,-1.366177082,-0.4356205761,0.6436408162,0.02481738105,0.05954186246,0.005294112954
 80 | 0.4464121163,0.4805024564,-0.3177123368,-1.956323504,-2.830649853,-0.185786888,0.6596040726,0.03991941363,0.1080363393,0.06749398261,-0.1480251402,-0.2142102271,-0.07594421506,-0.02967649326,-0.4555028975,-1.304680347,-0.4173257053,0.616758585,0.02370197698,0.0569970198,0.005065237172
 81 | 0.4470358491,0.4821905196,-0.3166577518,-1.958636403,-2.833996773,-0.1869735122,0.6591403484,0.03793036193,0.1017799005,0.06407012045,-0.1407376528,-0.203882888,-0.07226729393,-0.02824584767,-0.4375836551,-1.24495542,-0.3994668722,0.590626955,0.022628773,0.0545280017,0.004844472278
 82 | 0.4476284981,0.4837808311,-0.315656662,-1.960835457,-2.837182522,-0.1881026924,0.6586990356,0.03599748015,0.09577192366,0.06078436971,-0.1337732822,-0.1940526813,-0.06876513362,-0.02688404918,-0.4199605584,-1.187028885,-0.3820621967,0.5652502179,0.02159666456,0.05213497579,0.004631639458
 83 | 0.4481909573,0.4852772653,-0.3147068918,-1.962925673,-2.840214491,-0.1891771406,0.6582790017,0.03412241861,0.09001000971,0.05763369426,-0.1271196008,-0.1846956909,-0.06542956084,-0.0255877506,-0.402664125,-1.130917907,-0.365125984,0.5406318307,0.02060463652,0.04981780052,0.004426551983
 84 | 0.4487241209,0.4866836667,-0.313806355,-1.964911938,-2.843100309,-0.1901994795,0.6578791738,0.03230649978,0.08449103683,0.05461483449,-0.1207645983,-0.175789088,-0.06225283816,-0.02435382456,-0.3857217133,-1.076632619,-0.3486687541,0.5167713761,0.01965146698,0.04757606983,0.004228991456
 85 | 0.4492289126,0.48800385,-0.3129529953,-1.966798902,-2.84584713,-0.1911721826,0.6574986577,0.03055064008,0.07921122015,0.05172451213,-0.1146966517,-0.1673113108,-0.05922760442,-0.02317926101,-0.36915645,-1.024174333,-0.3326985538,0.4936674833,0.01873614267,0.04540929943,0.004038783256
 86 | 0.4497062564,0.4892415106,-0.3121447861,-1.968591094,-2.84846139,-0.1920976192,0.6571364999,0.02885544114,0.07416622341,0.04895931482,-0.1089045405,-0.1592417359,-0.0563467741,-0.02206124365,-0.3529878557,-0.9735395908,-0.3172212839,0.4713168144,0.01785758696,0.04331680387,0.003855767194
 87 | 0.4501571357,0.4904003441,-0.3113797903,-1.970292687,-2.850949526,-0.1929780394,0.6567918062,0.02722124569,0.06935122609,0.04631572217,-0.1033774242,-0.1515607387,-0.05360361934,-0.02099700831,-0.3372320533,-0.924716413,-0.3022401631,0.449714154,0.01701463945,0.04129773006,0.003679719288
 88 | 0.4505824745,0.4914839566,-0.3106561005,-1.971907973,-2.853317738,-0.193815589,0.6564637423,0.02564814687,0.06476099044,0.04379013553,-0.0981048122,-0.1442496777,-0.05099172145,-0.01998400688,-0.3219030499,-0.8776893616,-0.2877560258,0.4288532734,0.01620633528,0.03935112432,0.003510491457
 89 | 0.4509832263,0.4924958348,-0.309971869,-1.973440886,-2.855571747,-0.1946123391,0.6561514735,0.02413593791,0.06038993597,0.04137897864,-0.09307654947,-0.1372907311,-0.04850495607,-0.01901976578,-0.3070108294,-0.8324378133,-0.2737682164,0.4087259471,0.01543151122,0.03747589886,0.003347876016
 90 | 0.4513603449,0.493439436,-0.3093253076,-1.974895239,-2.857716799,-0.1953702271,0.6558542848,0.02268424816,0.05623219535,0.03907858953,-0.08828282356,-0.1306670308,-0.0461374484,-0.01810194924,-0.292563647,-0.7889370322,-0.2602742016,0.3893229365,0.01468919124,0.03567082062,0.003191707423
 91 | 0.4517147839,0.494318068,-0.3087147176,-1.97627461,-2.859758377,-0.1960911304,0.6555714607,0.02129248343,0.05228166655,0.03688533232,-0.08371414244,-0.1243624166,-0.04388364032,-0.017228337,-0.2785670459,-0.7471584678,-0.2472697049,0.3706340194,0.01397831831,0.03393467516,0.003041805001
 92 | 0.4520474672,0.4951349795,-0.3081383705,-1.977582693,-2.861701488,-0.1967768073,0.6553022861,0.01995986328,0.048532065,0.0347955972,-0.07936131954,-0.1183615476,-0.04173816368,-0.01639678143,-0.2650242448,-0.7070704699,-0.2347497791,0.3526471257,0.01329779997,0.03226603195,0.002897963859
 93 | 0.4523593485,0.4958932996,-0.3075946867,-1.978822708,-2.863550901,-0.1974289715,0.6550461054,0.01868549362,0.04497697577,0.03280573338,-0.07521548122,-0.1126498356,-0.03969594464,-0.01560529787,-0.2526293695,-0.6706873178,-0.2232796699,0.3356391191,0.01265542675,0.03069613315,0.002762634773
 94 | 0.4526513219,0.4965960681,-0.3070820868,-1.979997993,-2.865311146,-0.1980492175,0.6548022628,0.01746317744,0.0415959619,0.03091059253,-0.07127005607,-0.1072124243,-0.03775122017,-0.01485297363,-0.2399577945,-0.6337639093,-0.2116873711,0.3190208673,0.01203166042,0.02915699966,0.002630427247
 95 | 0.452924192,0.4972459972,-0.3065991104,-1.981111646,-2.866986275,-0.1986390799,0.6545701623,0.01629783213,0.03839724511,0.02910715714,-0.06751421094,-0.1020379066,-0.03590139747,-0.01413557492,-0.2277424783,-0.5984229445,-0.20055224,0.3030598462,0.01143565774,0.0276815258,0.002503463766
 96 | 0.4531788528,0.4978459477,-0.3061442971,-1.982166529,-2.86858058,-0.1992000341,0.6543492675,0.01518779434,0.03537417203,0.02739254758,-0.06394010782,-0.09711292386,-0.03414084017,-0.013453044,-0.2159790248,-0.5646249652,-0.1898676902,0.2877433896,0.01086638402,0.02626715787,0.002382005565
 97 | 0.4534161687,0.4983986616,-0.3057162762,-1.983165622,-2.870097876,-0.1997334808,0.6541390419,0.01413150225,0.0325201191,0.02576362714,-0.06054004282,-0.09242533892,-0.03246531636,-0.01280340832,-0.2046622187,-0.5323289633,-0.1796251386,0.2730561495,0.01032263227,0.02491227351,0.002265748335
 98 | 0.4536369741,0.4989067912,-0.3053137064,-1.984111547,-2.871541977,-0.2002407461,0.6539390087,0.01312741823,0.02982849628,0.02421717159,-0.05730658025,-0.08796370775,-0.03087086044,-0.01218507718,-0.1937855333,-0.5014916062,-0.1698149443,0.2589823008,0.009803353809,0.02361527458,0.002154496498
 99 | 0.4538421035,0.4993728697,-0.3049353063,-1.985006928,-2.87291646,-0.2007230967,0.6537486315,0.01217402518,0.02729279175,0.02274991758,-0.05423253775,-0.08371718228,-0.0293536596,-0.01159654465,-0.1833422482,-0.4720694125,-0.1604266167,0.2455061525,0.009307648055,0.02237456664,0.002048079856
100 | 0.4540323317,0.4997993112,-0.3045798242,-1.985854268,-2.874224424,-0.2011817545,0.6535674334,0.01126977615,0.02490657009,0.021358639,-0.0513109751,-0.07967542857,-0.02791005187,-0.01103638578,-0.1733246595,-0.444018811,-0.1514490694,0.2326106429,0.00883462932,0.02118841745,0.001946338336
101 | 0.4542084336,0.5001884699,-0.3042460978,-1.986655951,-2.875469446,-0.2016178519,0.6533949971,0.01041310374,0.02266349085,0.02004017495,-0.04853520542,-0.0758285895,-0.02653654106,-0.01050323714,-0.1637242883,-0.417295754,-0.1428709626,0.2202794403,0.008383398876,0.02005516179,0.001849099761
102 | 0.4543711245,0.5005425811,-0.3039329648,-1.98741436,-2.876654387,-0.2020324916,0.6532309055,0.009602421895,0.02055733092,0.01879142225,-0.0458987765,-0.07216730714,-0.02522980422,-0.009995797649,-0.154532209,-0.3918561041,-0.1346806437,0.2084955275,0.007953152061,0.01897310466,0.001756212441
103 | 0.4545211494,0.5008637905,-0.3036393523,-1.988131523,-2.877782106,-0.2024267018,0.6530747414,0.008836132474,0.01858198829,0.01760936715,-0.04339547083,-0.06868264079,-0.02398666367,-0.009512835182,-0.1457388699,-0.3676556051,-0.1268662065,0.1972418576,0.00754306186,0.01794055663,0.001667518751
104 | 0.4546592236,0.5011541247,-0.3033642173,-1.988809586,-2.878855228,-0.2028014958,0.6529260874,0.008112620562,0.01673150063,0.01649108902,-0.04101929441,-0.06536606699,-0.02280407585,-0.009053166024,-0.1373342127,-0.3446505964,-0.119415924,0.1865013987,0.007152289618,0.01695582084,0.00158285175
105 | 0.4547859728,0.5014155507,-0.3031065464,-1.989450455,-2.879876614,-0.2031578124,0.6527846456,0.007430285681,0.01500004902,0.0154337259,-0.03876447678,-0.06220950559,-0.02167916298,-0.008615679108,-0.1293216944,-0.3228315115,-0.1123270616,0.1762599647,0.006778756622,0.01601735502,0.001501632622
106 | 0.4549020827,0.5016499162,-0.302865386,-1.990056157,-2.880848646,-0.2034965456,0.6526499987,0.006787302904,0.01338172052,0.01443466451,-0.03662548214,-0.05920651928,-0.02060947567,-0.008199578151,-0.1216609851,-0.3020819724,-0.1055682003,0.1664951593,0.006425974425,0.01512358617,0.001425127848
107 | 0.4550081491,0.5018590093,-0.302639842,-1.990628481,-2.88177371,-0.2038185745,0.6525219083,0.006182392593,0.01187130623,0.01349099912,-0.03459697962,-0.05634705722,-0.01959168166,-0.007803237066,-0.1143573448,-0.282399267,-0.09913746268,0.1571926177,0.006088545546,0.01427233033,0.001351634157
108 | 0.4551047385,0.5020444989,-0.3024290502,-1.991169095,-2.88265419,-0.2041246891,0.6523999572,0.005613923538,0.01046345942,0.01260021795,-0.03267377988,-0.05362561345,-0.01862370037,-0.007426050026,-0.1073997021,-0.2637417912,-0.09302335978,0.1483357847,0.005767463706,0.01346235909,0.001281589153
109 | 0.4551924467,0.5022079945,-0.3022321761,-1.991679668,-2.883491993,-0.204415679,0.6522839069,0.005080338102,0.009153005667,0.01175983716,-0.0308509469,-0.05103553832,-0.0177031178,-0.0070670899,-0.1007769257,-0.2460685223,-0.08721423149,0.1399082392,0.005462038796,0.0126920836,0.001214856165
110 | 0.4552718401,0.502350986,-0.3020484149,-1.992161751,-2.884289503,-0.2046922892,0.6521734595,0.00458010193,0.007934940048,0.01096746232,-0.02912373841,-0.04857050255,-0.01682764664,-0.006725465879,-0.09447791427,-0.2293392569,-0.08169858158,0.1318937093,0.005171630532,0.01195993926,0.001151309232
111 | 0.4553433955,0.5024749637,-0.3018770516,-1.992616773,-2.885048389,-0.2049552202,0.6520683765,0.004111705814,0.006804431789,0.01022079308,-0.02748760395,-0.04622444883,-0.01599510945,-0.006400333717,-0.08849134296,-0.2135147899,-0.07646526396,0.1242762804,0.004895556718,0.01126441732,0.001090806327
112 | 0.4554076493,0.5025812984,-0.3017173409,-1.993046284,-2.885770559,-0.2052051425,0.6519683599,0.003673688974,0.00575682288,0.009517588653,-0.02593817562,-0.04399163648,-0.01520343963,-0.006090905983,-0.08280608058,-0.1985568255,-0.07150306553,0.1170402542,0.004633172881,0.01060403511,0.001033217646
113 | 0.4554650486,0.5026712418,-0.3015686274,-1.993451595,-2.88645792,-0.2054426968,0.6518731713,0.00326461927,0.004787627608,0.008855705149,-0.0244712662,-0.04186661541,-0.01445065998,-0.005796423648,-0.07741107047,-0.1844279468,-0.06680120528,0.1101703122,0.004383870866,0.009977333248,0.0009784203721
114 | 0.4555160701,0.5027460456,-0.3014302552,-1.993833899,-2.887112141,-0.2056684941,0.6517826319,0.00288310647,0.003892531618,0.008233074099,-0.02308285609,-0.03984420002,-0.01373489946,-0.005516166799,-0.07229542732,-0.1710918993,-0.06234893203,0.1036514193,0.004147099797,0.009382919408,0.0009263054817
115 | 0.4555611312,0.5028068423,-0.3013015985,-1.994194627,-2.887734652,-0.2058831006,0.6516964436,0.002527792007,0.003067390062,0.007647718769,-0.02176910266,-0.03791942447,-0.01305435225,-0.005249446724,-0.067448318,-0.1585132927,-0.05813586712,0.09746901691,0.003922280855,0.008819400333,0.0008767513791
116 | 0.4556006193,0.5028547645,-0.3011820912,-1.994534731,-2.888327122,-0.2060870677,0.6516144276,0.002197360387,0.002308227122,0.007097736932,-0.02052631974,-0.03608758003,-0.01240732521,-0.004995612893,-0.06285914034,-0.1466578543,-0.05415191501,0.09160877019,0.003708850592,0.008285460062,0.0008296433953
117 | 0.4556349516,0.5028908253,-0.3010711968,-1.994855404,-2.888890982,-0.2062809318,0.6515363455,0.001890536048,0.001611231826,0.006581305992,-0.01935098134,-0.03434419632,-0.01179218199,-0.004754042253,-0.05851742253,-0.1354924291,-0.05038718507,0.08605674654,0.003506304231,0.007779780775,0.000784878328
118 | 0.4556644857,0.5029159784,-0.3009683788,-1.995157719,-2.889427662,-0.2064651847,0.6514620781,0.001606088364,0.0009727548459,0.006096674595,-0.01823971607,-0.0326850079,-0.01120738126,-0.004524147604,-0.05459246039,-0.1254370809,-0.04698073491,0.08092487603,0.003315753071,0.007312920876,0.0007426269003
119 | 


--------------------------------------------------------------------------------
/include/common/dz.cuh:
--------------------------------------------------------------------------------
  1 | #include "utils/matrix.cuh"
  2 | 
  3 | template <typename T>
  4 | __global__
  5 | void compute_dz_kernel(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G_dense, T *d_C_dense, T *d_g_val, T *d_lambda, T *d_dz){
  6 |     // Fills d_dz one block-row at a time: even rows write a state_size slice, odd rows a control_size slice, of knot `set`.
  7 |     extern __shared__ T s_mem[]; 
  8 |     // Dynamic shared memory must hold max(2*S^2 + S, C^2 + S*C + 2*C) elements of T (S = state_size, C = control_size).
  9 |     const uint32_t states_sq = state_size*state_size;
 10 |     const uint32_t states_p_controls = state_size * control_size;
 11 |     const uint32_t controls_sq = control_size * control_size;
 12 |     const uint32_t states_s_controls = state_size + control_size;
 13 |     unsigned set;
 14 | 
 15 |     for(int blockrow = blockIdx.x; blockrow < 2*knot_points-1; blockrow+=gridDim.x){
 16 |         __syncthreads(); // a block can process several rows: let every thread finish copying the previous result out of s_mem before it is overwritten
 17 |         set = blockrow/2;
 18 |         
 19 |         if(blockrow%2){ // control row
 20 |             // shared mem config
 21 |             //    Rkinv |   BkT  | scratch
 22 |             //      C^2  |  S*C  |  2*C
 23 | 
 24 |             T *s_Rk_i = s_mem;
 25 |             T *s_BkT = s_Rk_i + controls_sq;
 26 |             T *s_scratch = s_BkT + states_p_controls;
 27 | 
 28 |             // load Rkinv from G (each knot stores S^2 entries followed by C^2 entries)
 29 |             glass::copy<T>(controls_sq, d_G_dense+set*(states_sq+controls_sq)+states_sq, s_Rk_i);
 30 | 
 31 |             // load Bk from C (each knot stores S^2 entries followed by S*C entries)
 32 |             glass::copy<T>(states_p_controls, d_C_dense+set*(states_sq+states_p_controls)+states_sq, s_BkT);
 33 | 
 34 |             __syncthreads();
 35 | 
 36 |             // compute BkT*lkp1 in scratch
 37 |             gato_ATx<T>(s_scratch,
 38 |                     s_BkT,
 39 |                     d_lambda+(set+1)*state_size,
 40 |                     state_size,
 41 |                     control_size);
 42 |             __syncthreads();
 43 | 
 44 |             // subtract from rk (in place in scratch)
 45 |             gato_vec_dif(s_scratch,
 46 |                         d_g_val+set*(states_s_controls)+state_size,
 47 |                         s_scratch,
 48 |                         control_size);
 49 |             __syncthreads();
 50 | 
 51 |             // multiply Rk_i*scratch, result goes to the second half of scratch (scratch + C)
 52 |             mat_vec_prod<T>( control_size, control_size,s_Rk_i,
 53 |                                                             s_scratch,
 54 |                                                             s_scratch+control_size);
 55 |             __syncthreads();
 56 |             
 57 |             // store in d_dz (control slice of knot `set`)
 58 |             glass::copy<T>(control_size, s_scratch+control_size, d_dz+set*(states_s_controls)+state_size);
 59 | 
 60 |         }
 61 |         else{   // state row
 62 | 
 63 |             T *s_Qk_i = s_mem;
 64 |             T *s_AkT = s_Qk_i + states_sq;
 65 |             T *s_scratch = s_AkT + states_sq;
 66 |             
 67 |             // shared mem config
 68 |             //    Qkinv |  AkT | scratch
 69 |             //      S^2     S^2     S
 70 | 
 71 |             /// TODO: error check
 72 |             // load Qkinv from G
 73 |             glass::copy<T>(states_sq, d_G_dense+set*(states_sq+controls_sq), s_Qk_i);
 74 | 
 75 |             ///TODO: linsys solver hasn't been checked with this change
 76 |             if(set != knot_points-1){
 77 |                 // load Ak from C
 78 |                 glass::copy<T>(states_sq, d_C_dense+set*(states_sq+states_p_controls), s_AkT);
 79 |                 __syncthreads();
 80 |                             
 81 |                 // compute AkT*lkp1 in scratch
 82 |                 gato_ATx(s_scratch,
 83 |                         s_AkT,
 84 |                         d_lambda+(set+1)*state_size,
 85 |                         state_size,
 86 |                         state_size);
 87 |                 // (the barrier for this result is issued once, after the if/else)
 88 |             }
 89 |             else{
 90 |                 for(int i = threadIdx.x; i < state_size; i+=blockDim.x){
 91 |                     s_scratch[i] = 0; // last knot has no lkp1 term
 92 |                 }
 93 |             }
 94 |             __syncthreads(); // scratch (product or zero fill) must be complete before any thread reads it; the branch is uniform per block
 95 | 
 96 |             // add lk to scratch
 97 |             gato_vec_sum<T>(s_scratch,     // out
 98 |                         d_lambda+set*state_size,
 99 |                         s_scratch,
100 |                         state_size);
101 |             __syncthreads();
102 | 
103 |             // subtract from qk in scratch
104 |             gato_vec_dif<T>(s_scratch,
105 |                         d_g_val+set*(states_s_controls),
106 |                         s_scratch,
107 |                         state_size);
108 |             __syncthreads();
109 |             
110 |             
111 |             // multiply Qk_i*scratch, result reuses the AkT buffer (Ak is no longer needed)
112 |             mat_vec_prod<T>( state_size, state_size,s_Qk_i,
113 |                                                         s_scratch,
114 |                                                         s_AkT);
115 |             __syncthreads();
116 | 
117 |             // store in dz (state slice of knot `set`)
118 |             glass::copy<T>(state_size, s_AkT, d_dz+set*(states_s_controls));
119 |         }
120 |     }
121 | }
122 | 
123 | 
124 | template <typename T>
125 | void compute_dz(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G_dense, T *d_C_dense, T *d_g_val, T *d_lambda, T *d_dz){
126 |     // Host launcher for compute_dz_kernel: knot_points blocks of DZ_THREADS threads, all pointers are passed through unchanged.
127 |     // State rows stage Qkinv | AkT | scratch in shared memory: 2*S^2 + S elements.
128 |     const size_t state_row_elems = 2*static_cast<size_t>(state_size)*state_size + state_size;
129 |     // Control rows stage Rkinv | BkT | two scratch vectors: C^2 + S*C + 2*C elements.
130 |     const size_t control_row_elems = static_cast<size_t>(control_size)*control_size + static_cast<size_t>(state_size)*control_size + 2*static_cast<size_t>(control_size);
131 |     // Reserve the larger layout; the state-row size alone is too small once control_size approaches state_size.
132 |     const size_t smem_elems = (state_row_elems > control_row_elems) ? state_row_elems : control_row_elems;
133 | 
134 |     compute_dz_kernel<T><<<knot_points, DZ_THREADS, sizeof(T)*smem_elems>>>(
135 |         state_size, control_size, knot_points,
136 |         d_G_dense, d_C_dense, d_g_val, d_lambda, d_dz);
137 | }


--------------------------------------------------------------------------------
/include/common/integrator.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cooperative_groups.h>
  3 | #include <algorithm>
  4 | #include <cmath>
  5 | 
  6 | namespace cgrps = cooperative_groups;
  7 | #include "dynamics/rbd_plant.cuh"
  8 | 
  9 | #include "glass.cuh"
 10 | 
 11 | 
 12 | template<typename T>
 13 | __host__ __device__ 
 14 | T angleWrap(T input){ // wraps an angle in radians that overshoots [-pi, pi] by less than one turn back into that interval
 15 |     const T pi = static_cast<T>(3.14159265358979323846);
 16 |     if(input > pi){input -= 2*pi;}   // e.g. pi + 0.1 -> -pi + 0.1 (shift by a full turn, do not mirror)
 17 |     if(input < -pi){input += 2*pi;}  // e.g. -pi - 0.1 -> pi - 0.1
 18 |     return input;
 19 | }
 20 | 
 21 | 
 22 | template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
 23 | __device__ 
 24 | void exec_integrator_error(uint32_t state_size, T *s_err, T *s_qkp1, T *s_qdkp1, T *s_q, T *s_qd, T *s_qdd, T dt, cgrps::thread_block block, bool absval = false){
 25 |     // Writes into s_err the gap between the given next state (s_qkp1, s_qdkp1) and one integrator step from (s_q, s_qd, s_qdd): positions in s_err[0, S/2), velocities in s_err[S/2, S). No barrier is issued here and `block` is unused.
 26 |     for (unsigned ind = threadIdx.x; ind < state_size/2; ind += blockDim.x){
 27 |         T new_qkp1; T new_qdkp1; // state predicted by the integrator for this index
 28 |         if (INTEGRATOR_TYPE == 0){ // euler xkp1 = xk + dt*dxk
 29 |             new_qkp1 = s_q[ind] + dt*s_qd[ind];
 30 |             new_qdkp1 = s_qd[ind] + dt*s_qdd[ind];
 31 |         }
 32 |         // semi-implicit euler
 33 |         // qdkp1 = qdk + dt*qddk
 34 |         // qkp1 = qk  + dt*qdkp1
 35 |         else if (INTEGRATOR_TYPE == 1){
 36 |             new_qdkp1 = s_qd[ind] + dt*s_qdd[ind];
 37 |             new_qkp1 = s_q[ind] + dt*new_qdkp1;
 38 |         }
 39 |         else {printf("Integrator [%d] not defined. Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);}
 40 | 
 41 |         // wrap angles if needed (positions only)
 42 |         if(ANGLE_WRAP){
 43 |             new_qkp1 = angleWrap(new_qkp1);
 44 |         }
 45 | 
 46 |         // then compute error (absolute value when absval is set, signed otherwise)
 47 |         if(absval){
 48 |             s_err[ind] = abs(s_qkp1[ind] - new_qkp1);
 49 |             s_err[ind + state_size/2] = abs(s_qdkp1[ind] - new_qdkp1);    
 50 |         }
 51 |         else{
 52 |             s_err[ind] = s_qkp1[ind] - new_qkp1;
 53 |             s_err[ind + state_size/2] = s_qdkp1[ind] - new_qdkp1;
 54 |         }
 55 |         // printf("err[%f] with new qkp1[%f] vs orig[%f] and new qdkp1[%f] vs orig[%f] with qk[%f] qdk[%f] qddk[%f] and dt[%f]\n",s_err[ind],new_qkp1,s_qkp1[ind],new_qdkp1,s_qdkp1[ind],s_q[ind],s_qd[ind],s_qdd[ind],dt);
 56 |     }
 57 | }
 58 | 
 59 | template <typename T, unsigned INTEGRATOR_TYPE = 0>
 60 | __device__
 61 | void exec_integrator_gradient(uint32_t state_size, uint32_t control_size, T *s_Ak, T *s_Bk, T *s_dqdd, T dt, cgrps::thread_block block){
 62 |     // Fills s_Ak (state_size x state_size) and s_Bk (state_size x control_size) from s_dqdd and dt; entry (r, c) lives at c*state_size + r.
 63 |     const uint32_t thread_id = threadIdx.x;
 64 |     const uint32_t block_dim = blockDim.x;
 65 |     // s_dqdd is read with state_size/2 rows per column, columns ordered dq, dqd, du. No barrier is issued here and `block` is unused.
 66 |     // A and B for the selected integrator
 67 |     if (INTEGRATOR_TYPE == 0){
 68 |         // apply the euler rule -- xkp1 = xk + dt*dxk thus AB = [I_{state},0_{control}] + dt*dxd
 69 |         // where dxd = [ 0, I, 0; dqdd/dq, dqdd/dqd, dqdd/du]
 70 |         for (unsigned ind = thread_id; ind < state_size*(state_size + control_size); ind += block_dim){
 71 |             int c = ind / state_size; int r = ind % state_size;
 72 |             T *dst = (c < state_size)? &s_Ak[ind] : &s_Bk[ind - state_size*state_size]; // dst
 73 |             T val = (r == c) * static_cast<T>(1); // first term (non-branching)
 74 |             val += (r < state_size/2 && r == c - state_size/2) * dt; // first dxd term (non-branching)
 75 |             if(r >= state_size/2) { val += dt * s_dqdd[c*state_size/2 + r - state_size/2]; }
 76 |             ///TODO: EMRE why didn't this error before? (the non-branching form below indexes s_dqdd for every r; for r < state_size/2 the index is out of range, e.g. when c == 0)
 77 |             // val += (r >= state_size/2) * dt * s_dqdd[c*state_size/2 + r - state_size/2]; // second dxd term (non-branching)
 78 |             *dst = val;
 79 |         }
 80 |     }
 81 |     else if (INTEGRATOR_TYPE == 1){
 82 |         // semi-implicit euler
 83 |         // qdkp1 = qdk + dt*qddk
 84 |         // qkp1 = qk  + dt*qdkp1 = qk + dt*qdk + dt^2*qddk
 85 |         // dxkp1 = [Ix | 0u ] + dt*[[0q, Iqd, 0u] + dt*dqdd
 86 |         //                                             dqdd]
 87 |         // Ak = I + dt * [[0,I] + dt*dqdd/dx; dqdd/dx]
 88 |         // Bk = dt * [dt*dqdd/du; dqdd/du]   (dtVal below is dt for position rows and 1 for velocity rows)
 89 |         for (unsigned ind = thread_id; ind < state_size*state_size; ind += block_dim){
 90 |             int c = ind / state_size; int r = ind % state_size; int rdqdd = r % (state_size/2);
 91 |             T dtVal = static_cast<T>((r == rdqdd)*dt + (r != rdqdd));
 92 |             s_Ak[ind] = static_cast<T>((r == c) + dt*(r == c - state_size/2)) +
 93 |                         dt * s_dqdd[c*state_size/2 + rdqdd] * dtVal;
 94 |             if(c < control_size){
 95 |                 s_Bk[ind] = dt * s_dqdd[state_size*state_size/2 + c*state_size/2 + rdqdd] * dtVal;
 96 |             }
 97 |         }
 98 |     }
 99 |     else{printf("Integrator [%d] not defined. Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);}
100 | }
101 | 
102 | 
// Applies one integration step to every joint:
//   INTEGRATOR_TYPE 0 (Euler):               qkp1 = q + dt*qd,    qdkp1 = qd + dt*qdd
//   INTEGRATOR_TYPE 1 (semi-implicit Euler): qdkp1 = qd + dt*qdd, qkp1 = q + dt*qdkp1
// With ANGLE_WRAP the new positions are passed through angleWrap().
//
// state_size/2 entries of each array are touched, strided over threadIdx.x /
// blockDim.x. Each thread only reads back values it wrote itself, and no
// barrier is issued here. `block` is not used.
template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
__device__ 
void exec_integrator(uint32_t state_size, T *s_qkp1, T *s_qdkp1, T *s_q, T *s_qd, T *s_qdd, T dt, cgrps::thread_block block){

    const uint32_t num_joints = state_size/2;
    const uint32_t stride = blockDim.x;

    for (uint32_t j = threadIdx.x; j < num_joints; j += stride){
        switch (INTEGRATOR_TYPE){
            case 0:
                // euler xk = xk + dt *dxk
                s_qkp1[j] = s_q[j] + dt*s_qd[j];
                s_qdkp1[j] = s_qd[j] + dt*s_qdd[j];
                break;
            case 1:
                // semi-implicit euler: velocity first, then position from the new velocity
                s_qdkp1[j] = s_qd[j] + dt*s_qdd[j];
                s_qkp1[j] = s_q[j] + dt*s_qdkp1[j];
                break;
            default:
                printf("Integrator [%d] not defined. Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);
                break;
        }

        // wrap angles if needed
        if (ANGLE_WRAP){
            s_qkp1[j] = angleWrap(s_qkp1[j]);
        }
    }
}
131 | 
// Runs the forward dynamics with gradient for one knot, then produces either
// the integrated next state or the integrator error, and finally Ak / Bk.
//
// s_xux  : [q | qd | u] of the current knot; when COMPUTE_INTEGRATOR_ERROR is
//          set, the state of the next knot is read right behind it.
// s_xnew_err : receives the integrator error (COMPUTE_INTEGRATOR_ERROR) or the
//          integrated next state [qkp1 | qdkp1] otherwise.
// s_temp : scratch of size state_size/2*(state_size + control_size + 1) + DYNAMICS_TEMP,
//          laid out as [qdd | dqdd | dynamics scratch].
//
// Only one barrier is issued (after the dynamics). The caller must
// synchronize before consuming s_xnew_err, s_Ak or s_Bk.
template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false, bool COMPUTE_INTEGRATOR_ERROR = false>
__device__ __forceinline__
void integratorAndGradient(uint32_t state_size, uint32_t control_size, T *s_xux, T *s_Ak, T *s_Bk, T *s_xnew_err, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){

    const uint32_t half = state_size/2;

    // views into the packed knot data
    T *s_q = s_xux;
    T *s_qd = s_q + half;
    T *s_u = s_qd + half;

    // carve up the scratch buffer
    T *s_qdd = s_temp;
    T *s_dqdd = s_qdd + half;
    T *s_extra_temp = s_dqdd + half*(state_size + control_size);

    // qdd and dqdd first
    gato_plant::forwardDynamicsAndGradient<T>(s_dqdd, s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const);
    block.sync();

    // then xnew or the error against the next knot's state
    if (COMPUTE_INTEGRATOR_ERROR){
        T *s_qkp1 = s_xux + state_size + control_size;
        T *s_qdkp1 = s_qkp1 + half;
        exec_integrator_error<T,INTEGRATOR_TYPE,ANGLE_WRAP>(state_size, s_xnew_err, s_qkp1, s_qdkp1, s_q, s_qd, s_qdd, dt, block);
    }
    else{
        exec_integrator<T,INTEGRATOR_TYPE,ANGLE_WRAP>(state_size, s_xnew_err, s_xnew_err + half, s_q, s_qd, s_qdd, dt, block);
    }

    // and finally the gradient
    exec_integrator_gradient<T,INTEGRATOR_TYPE>(state_size, control_size, s_Ak, s_Bk, s_dqdd, dt, block);
}
158 | 
159 | 
// Returns the reduced integrator error between the state stored in s_xkp1 and
// the state obtained by integrating s_xuk = [q | qd | u] for one step of dt.
//
// s_temp of size 3*state_size/2 + DYNAMICS_TEMP, laid out as
// [qdd (state_size/2) | err | dynamics scratch]. The dynamics scratch starts
// state_size/2 entries into err, while the reduction below runs over
// state_size entries of err. The two regions therefore overlap, which is fine
// only because the scratch is no longer needed once forwardDynamics has
// finished and the block has synchronized.
//
// The trailing `true` passed to exec_integrator_error presumably requests
// absolute values so that the reduction yields a norm -- confirm against that
// function's signature.
//
// Every thread of the block must call this (it contains block-wide barriers);
// all threads receive s_err[0] after the final barrier.
template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
__device__ 
T integratorError(uint32_t state_size, T *s_xuk, T *s_xkp1, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){

    const uint32_t half = state_size/2;

    // current knot
    T *s_q = s_xuk;
    T *s_qd = s_q + half;
    T *s_u = s_qd + half;
    // next knot
    T *s_qkp1 = s_xkp1;
    T *s_qdkp1 = s_qkp1 + half;
    // scratch
    T *s_qdd = s_temp;
    T *s_err = s_qdd + half;
    T *s_extra_temp = s_err + half;

    // first compute qdd
    gato_plant::forwardDynamics<T>(s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const, block);
    block.sync();

    // then apply the integrator and compute the per-entry error
    exec_integrator_error<T,INTEGRATOR_TYPE,ANGLE_WRAP>(state_size, s_err, s_qkp1, s_qdkp1, s_q, s_qd, s_qdd, dt, block, true);
    block.sync();

    // finish off forming the error
    glass::reduce<T>(state_size, s_err);
    block.sync();

    const T reduced_error = s_err[0];
    return reduced_error;
}
194 | 
195 | 
196 | 
// Integrates s_xuk = [q | qd | u] forward by dt and stores the resulting state
// [qkp1 | qdkp1] in s_xkp1.
//
// s_temp is laid out as [qdd (state_size/2) | dynamics scratch]. One barrier
// is issued after the dynamics; the caller must synchronize before reading
// s_xkp1.
template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
__device__ 
void integrator(uint32_t state_size, T *s_xkp1, T *s_xuk, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){

    const uint32_t half = state_size/2;

    // inputs
    T *s_q = s_xuk;
    T *s_qd = s_q + half;
    T *s_u = s_qd + half;
    // outputs
    T *s_qkp1 = s_xkp1;
    T *s_qdkp1 = s_qkp1 + half;
    // scratch
    T *s_qdd = s_temp;
    T *s_extra_temp = s_qdd + half;

    // first compute qdd
    gato_plant::forwardDynamics<T>(s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const, block);
    block.sync();

    // then step the state
    exec_integrator<T,INTEGRATOR_TYPE,ANGLE_WRAP>(state_size, s_qkp1, s_qdkp1, s_q, s_qd, s_qdd, dt, block);
}
208 | 
209 | 
210 | 
211 | 
// Integrates a single [state | control] vector forward by dt.
//
//   d_xuk  : state_size + control_size input entries ([q | qd | u])
//   d_xkp1 : state_size output entries (may be a different buffer than d_xuk)
//
// Launch layout: the kernel only indexes with threadIdx.x, so it is meant to
// be launched with a single block (any block size). Dynamic shared memory
// must hold
//   state_size                     (s_xkp1)
// + state_size + control_size      (s_xuk)
// + state_size/2 + dynamics scratch (s_temp, consumed by integrator())
// elements of T.
template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
__global__
void integrator_kernel(uint32_t state_size, uint32_t control_size, T *d_xkp1, T *d_xuk, void *d_dynMem_const, T dt){
    extern __shared__ T s_smem[];
    T *s_xkp1 = s_smem;
    T *s_xuk = s_xkp1 + state_size; 
    T *s_temp = s_xuk + state_size + control_size;
    cgrps::thread_block block = cgrps::this_thread_block();

    // stage the input in shared memory
    for (unsigned ind = threadIdx.x; ind < state_size + control_size; ind += blockDim.x){
        s_xuk[ind] = d_xuk[ind];
    }

    block.sync();
    integrator<T,INTEGRATOR_TYPE,ANGLE_WRAP>(state_size, s_xkp1, s_xuk, s_temp, d_dynMem_const, dt, block);
    block.sync();

    // write the new state back
    for (unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
        d_xkp1[ind] = s_xkp1[ind];
    }
}
233 | 
// Host wrapper around integrator_kernel.
//
//   d_xs : device buffer that receives the integrated state (state_size entries)
//   d_xu : device buffer holding the current [state | control]
//          (state_size + control_size entries)
//
// The launch is asynchronous and goes to the default stream; no device
// synchronization is performed here. Only launch-configuration errors are
// checked.
template <typename T>
void integrator_host(uint32_t state_size, uint32_t control_size, T *d_xs, T *d_xu, void *d_dynMem_const, T dt){

    // xkp1 + xuk + qdd + dynamics scratch (the gradient scratch size is used as an upper bound)
    const size_t integrator_kernel_smem_size = sizeof(T)*(2*state_size + control_size + state_size/2 + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared());

    //TODO: one block one thread? Why?
    integrator_kernel<T><<<1,1, integrator_kernel_smem_size>>>(state_size, control_size, d_xs, d_xu, d_dynMem_const, dt);

    // A kernel launch does not return a status; pick up bad launch configurations
    // (e.g. too much shared memory) instead of silently skipping the step.
    gpuErrchk(cudaGetLastError());
}
256 | 
// Shifts the trajectory in d_xu forward by one knot: knot k+1 is copied onto
// knot k for k = 0 .. knot_points-2. Knots are spaced state_size + control_size
// apart; the copy onto the second-to-last knot moves only the state, every
// other copy moves state and control. The last knot itself is left untouched.
//
// The copies are issued one knot at a time because a single cudaMemcpy over
// the whole range would have overlapping source and destination.
template <typename T>
void just_shift(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xu){
    // nothing to shift, and knot_points-1 below would wrap around for 0
    if (knot_points < 2){ return; }

    const uint32_t knot_stride = state_size + control_size;
    for (uint32_t knot = 0; knot < knot_points-1; knot++){
        // the final source knot carries no control
        const uint32_t stepsize = (state_size+(knot<knot_points-2)*control_size);
        gpuErrchk(cudaMemcpy(&d_xu[knot*knot_stride], &d_xu[(knot+1)*knot_stride], stepsize*sizeof(T), cudaMemcpyDeviceToDevice));
    }
}
264 | 
265 | 
// Advances the state in d_x by one Euler step of dt under the control d_u,
// writing the result back into d_x (in place).
//
//   d_x : state_size entries, read and then overwritten
//   d_u : control_size entries, read only
//
// Launch layout: the kernel only indexes with threadIdx.x, so it is meant to
// be launched with a single block. Dynamic shared memory must hold
// 2*state_size + control_size + state_size/2 + dynamics scratch elements of T.
template <typename T>
__global__
void simple_integrator_kernel(uint32_t state_size, uint32_t control_size, T *d_x, T *d_u, void *d_dynMem_const, T dt){

    extern __shared__ T s_mem[];
    T *s_xkp1 = s_mem;
    T *s_xuk = s_xkp1 + state_size; 
    T *s_temp = s_xuk + state_size + control_size;
    cgrps::thread_block block = cgrps::this_thread_block();

    // gather [x | u] from the two separate device buffers
    for (unsigned ind = threadIdx.x; ind < state_size + control_size; ind += blockDim.x){
        s_xuk[ind] = (ind < state_size) ? d_x[ind] : d_u[ind-state_size];
    }

    block.sync();
    // plain Euler, no angle wrapping
    integrator<T,0,false>(state_size, s_xkp1, s_xuk, s_temp, d_dynMem_const, dt, block);
    block.sync();

    for (unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
        d_x[ind] = s_xkp1[ind];
    }
}
294 | 
// Simulates the system forward for sim_time_us microseconds, updating d_xs in
// place, using the controls stored in the trajectory d_xu.
//
//   d_xu           : trajectory with knots spaced state_size + control_size
//                    apart, the control of a knot following its state
//   timestep       : duration of one trajectory knot in seconds
//   time_offset_us : time already elapsed since the start of the trajectory
//   sim_time_us    : amount of time to simulate
//   (unnamed)      : unused trailing parameter, kept for interface compatibility
//
// The simulation advances in fixed steps of 2e-4 s followed by one step for
// the remainder. The control for each step is taken from the knot active at
// that time; the remainder step reuses the control of the last full step.
//
// The control knot index is limited to knot_points-2. The last knot is treated
// as carrying no control, consistent with how just_shift() copies it, so a
// larger index would read past the stored controls.
//
// All launches are asynchronous on the default stream; only launch errors are
// checked, no device synchronization is performed.
template <typename T>
void simple_simulate(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xs, T *d_xu, void *d_dynMem_const, double timestep, double time_offset_us, double sim_time_us, unsigned long long = 123456){

    const double time_offset = time_offset_us * 1e-6;
    const double sim_time = sim_time_us * 1e-6;

    const T sim_step_time = 2e-4;
    const size_t simple_integrator_kernel_smem_size = sizeof(T)*(2*state_size + control_size + state_size/2 + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared());
    const uint32_t states_s_controls = state_size + control_size;
    // last knot that still has a control attached
    const uint32_t last_control_knot = (knot_points >= 2) ? knot_points - 2 : 0;

    uint32_t control_offset = static_cast<uint32_t>((time_offset) / timestep);
    if (control_offset > last_control_knot){ control_offset = last_control_knot; }
    T *control = &d_xu[control_offset * states_s_controls + state_size];

    const uint32_t sim_steps_needed = static_cast<uint32_t>(sim_time / sim_step_time);

    // full-size steps
    for(uint32_t step = 0; step < sim_steps_needed; step++){
        control_offset = static_cast<uint32_t>((time_offset + step * sim_step_time) / timestep);
        if (control_offset > last_control_knot){ control_offset = last_control_knot; }
        control = &d_xu[control_offset * states_s_controls + state_size];

        simple_integrator_kernel<T><<<1,32,simple_integrator_kernel_smem_size>>>(state_size, control_size, d_xs, control, d_dynMem_const, sim_step_time);
    }

    // remainder that does not fill a whole step
    const T half_sim_step_time = fmod(sim_time, sim_step_time);

    simple_integrator_kernel<T><<<1,32,simple_integrator_kernel_smem_size>>>(state_size, control_size, d_xs, control, d_dynMem_const, half_sim_step_time);

    // A launch error stays recorded until it is read, so a single check covers
    // every launch issued above.
    gpuErrchk(cudaGetLastError());
}


--------------------------------------------------------------------------------
/include/common/kkt.cuh:
--------------------------------------------------------------------------------
  1 | 
  2 | #include "dynamics/rbd_plant.cuh"
  3 | #include "merit.cuh"
  4 | 
  5 | template <typename T>
  6 | size_t get_kkt_smem_size(uint32_t state_size, uint32_t control_size){
  7 |     const uint32_t states_sq = state_size * state_size;
  8 |     const uint32_t controls_sq = control_size * control_size;
  9 | 
 10 |     size_t smem_size = sizeof(T)*(3*states_sq + 
 11 |                                   controls_sq + 
 12 |                                   7 * state_size + 
 13 |                                   3 * control_size + 
 14 |                                   state_size*control_size + 
 15 |                                   max(grid::EE_POS_SHARED_MEM_COUNT, grid::DEE_POS_SHARED_MEM_COUNT) + 
 16 |                                   max((state_size/2)*(state_size + control_size + 1) + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared(), 3 + (state_size/2)*6));
 17 | 
 18 |     return smem_size;
 19 | }
 20 | 
 21 | 
 22 | template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
 23 | __global__
 24 | void generate_kkt_submatrices(uint32_t state_size, 
 25 |                               uint32_t control_size, 
 26 |                               uint32_t knot_points,
 27 |                               T *d_G_dense, 
 28 |                               T *d_C_dense, 
 29 |                               T *d_g, 
 30 |                               T *d_c,
 31 |                               void *d_dynMem_const, 
 32 |                               T timestep,
 33 |                               T *d_eePos_traj, 
 34 |                               T *d_xs, 
 35 |                               T *d_xu)
 36 | {
 37 | 
 38 |     const cgrps::thread_block block = cgrps::this_thread_block();
 39 |     const uint32_t thread_id = threadIdx.x;
 40 |     const uint32_t num_threads = blockDim.x;
 41 |     const uint32_t block_id = blockIdx.x;
 42 |     const uint32_t num_blocks = gridDim.x;
 43 | 
 44 |     const uint32_t states_sq = state_size*state_size;
 45 |     const uint32_t states_p_controls = state_size * control_size;
 46 |     const uint32_t controls_sq = control_size * control_size;
 47 |     const uint32_t states_s_controls = state_size + control_size;
 48 |     
 49 | 
 50 |     extern __shared__ T s_temp[];
 51 | 
 52 |     T *s_xux = s_temp;
 53 |     T *s_eePos_traj = s_xux + 2*state_size + control_size;
 54 |     T *s_Qk = s_eePos_traj + 6;
 55 |     T *s_Rk = s_Qk + states_sq;
 56 |     T *s_qk = s_Rk + controls_sq;
 57 |     T *s_rk = s_qk + state_size;
 58 |     T *s_end = s_rk + control_size;
 59 | 
 60 |     
 61 |     for(unsigned k = block_id; k < knot_points-1; k += num_blocks){
 62 | 
 63 |         glass::copy<T>(2*state_size + control_size, &d_xu[k*states_s_controls], s_xux);
 64 |         glass::copy<T>(2 * 6, &d_eePos_traj[k*6], s_eePos_traj);
 65 |         
 66 |         __syncthreads();    
 67 | 
 68 |         if(k==knot_points-2){          // last block
 69 | 
 70 |             T *s_Ak = s_end;
 71 |             T *s_Bk = s_Ak + states_sq;
 72 |             T *s_Qkp1 = s_Bk + states_p_controls;
 73 |             T *s_qkp1 = s_Qkp1 + states_sq;
 74 |             T *s_integrator_error = s_qkp1 + state_size;
 75 |             T *s_extra_temp = s_integrator_error + state_size;
 76 |             
 77 |             integratorAndGradient<T, INTEGRATOR_TYPE, ANGLE_WRAP, true>(
 78 |                 state_size, control_size,
 79 |                 s_xux,
 80 |                 s_Ak,
 81 |                 s_Bk,
 82 |                 s_integrator_error,
 83 |                 s_extra_temp,
 84 |                 d_dynMem_const,
 85 |                 timestep,
 86 |                 block
 87 |             );
 88 |             __syncthreads();
 89 |             
 90 |             gato_plant::trackingCostGradientAndHessian_lastblock<T>(
 91 |                 state_size,
 92 |                 control_size,
 93 |                 s_xux,
 94 |                 s_eePos_traj,
 95 |                 s_Qk,
 96 |                 s_qk,
 97 |                 s_Rk,
 98 |                 s_rk,
 99 |                 s_Qkp1,
100 |                 s_qkp1,
101 |                 s_extra_temp,
102 |                 d_dynMem_const
103 |             );
104 |             __syncthreads();
105 | 
106 |             for(int i = thread_id; i < state_size; i+=num_threads){
107 |                 d_c[i] = d_xu[i] - d_xs[i];
108 |             }
109 |             glass::copy<T>(states_sq, s_Qk, &d_G_dense[(states_sq+controls_sq)*k]);
110 |             glass::copy<T>(controls_sq, s_Rk, &d_G_dense[(states_sq+controls_sq)*k+states_sq]);
111 |             glass::copy<T>(states_sq, s_Qkp1, &d_G_dense[(states_sq+controls_sq)*(k+1)]);
112 |             glass::copy<T>(state_size, s_qk, &d_g[states_s_controls*k]);
113 |             glass::copy<T>(control_size, s_rk, &d_g[states_s_controls*k+state_size]);
114 |             glass::copy<T>(state_size, s_qkp1, &d_g[states_s_controls*(k+1)]);
115 |             glass::copy<T>(states_sq, static_cast<T>(-1), s_Ak, &d_C_dense[(states_sq+states_p_controls)*k]);
116 |             glass::copy<T>(states_p_controls, static_cast<T>(-1), s_Bk, &d_C_dense[(states_sq+states_p_controls)*k+states_sq]);
117 |             glass::copy<T>(state_size, s_integrator_error, &d_c[state_size*(k+1)]);
118 | 
119 |         }
120 |         else{                               // not last knot
121 | 
122 |             T *s_Ak = s_end;
123 |             T *s_Bk = s_Ak + states_sq;
124 |             T *s_integrator_error = s_Bk + states_p_controls;
125 |             T *s_extra_temp = s_integrator_error + state_size;
126 | 
127 |             integratorAndGradient<T, 
128 |                                   INTEGRATOR_TYPE, 
129 |                                   ANGLE_WRAP, 
130 |                                   true>
131 |                                  (state_size, control_size,
132 |                                   s_xux,
133 |                                   s_Ak,
134 |                                   s_Bk,
135 |                                   s_integrator_error,
136 |                                   s_extra_temp,
137 |                                   d_dynMem_const,
138 |                                   timestep,
139 |                                   block);
140 |             __syncthreads();
141 |            
142 |             gato_plant::trackingCostGradientAndHessian<T>(state_size,
143 |                                                   control_size,
144 |                                                   s_xux,
145 |                                                   s_eePos_traj,
146 |                                                   s_Qk,
147 |                                                   s_qk,
148 |                                                   s_Rk,
149 |                                                   s_rk,
150 |                                                   s_extra_temp,
151 |                                                   d_dynMem_const);
152 |             __syncthreads();
153 |  
154 |             glass::copy<T>(states_sq, s_Qk, &d_G_dense[(states_sq+controls_sq)*k]);
155 |             glass::copy<T>(controls_sq, s_Rk, &d_G_dense[(states_sq+controls_sq)*k+states_sq]);
156 |             glass::copy<T>(state_size, s_qk, &d_g[states_s_controls*k]);
157 |             glass::copy<T>(control_size, s_rk, &d_g[states_s_controls*k+state_size]);
158 |             glass::copy<T>(states_sq, static_cast<T>(-1), s_Ak, &d_C_dense[(states_sq+states_p_controls)*k]);
159 |             glass::copy<T>(states_p_controls, static_cast<T>(-1), s_Bk, &d_C_dense[(states_sq+states_p_controls)*k+states_sq]);
160 |             glass::copy<T>(state_size, s_integrator_error, &d_c[state_size*(k+1)]);
161 |         }
162 |     }
163 | }
164 | 


--------------------------------------------------------------------------------
/include/common/merit.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <cstdint>
  4 | #include <cooperative_groups.h>
  5 | #include "dynamics/rbd_plant.cuh"
  6 | #include "integrator.cuh"
  7 | 
  8 | //TODO: this
  9 | template <typename T>
 10 | size_t get_merit_smem_size(uint32_t state_size, uint32_t control_size)
 11 | {
 12 |     return sizeof(T) * (6 + (2 * state_size + control_size ) + 
 13 |                         ((int) 1.5 * state_size) + gato_plant::forwardDynamics_TempMemSize_Shared());
 14 | }
 15 | 
 16 | // cost compute for line search
 17 | template <typename T>
 18 | __global__
 19 | void ls_gato_compute_merit(uint32_t state_size,
 20 |                            uint32_t control_size,
 21 |                            uint32_t knot_points,
 22 |                            T *d_xs,
 23 |                            T *d_xu, 
 24 |                            T *d_eePos_traj, 
 25 |                            T mu, 
 26 |                            T dt, 
 27 |                            void *d_dynMem_const, 
 28 |                            T *d_dz,
 29 |                            uint32_t alpha_multiplier, 
 30 |                            T *d_merits_out, 
 31 |                            T *d_merit_temp)
 32 | {
 33 | 
 34 |     grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *)d_dynMem_const;
 35 |     const cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
 36 |     const uint32_t thread_id = threadIdx.x;
 37 |     const uint32_t num_threads = blockDim.x;
 38 |     const uint32_t block_id = blockIdx.x;
 39 |     const uint32_t num_blocks = gridDim.x;
 40 | 
 41 |     const uint32_t states_s_controls = state_size + control_size;
 42 | 
 43 |     extern __shared__ T s_xux_k[];
 44 | 
 45 |     T Jk, ck, pointmerit;
 46 | 
 47 |     T alpha = -1.0 / (1 << alpha_multiplier);   // alpha sign
 48 |     T *s_eePos_k_traj = s_xux_k + 2*state_size+control_size;
 49 |     T *s_temp = s_eePos_k_traj + 6;
 50 | 
 51 | 
 52 |     for(unsigned knot = block_id; knot < knot_points; knot += num_blocks){
 53 | 
 54 |         for(int i = thread_id; i < state_size+(knot < knot_points-1)*(states_s_controls); i+=num_threads){
 55 |             s_xux_k[i] = d_xu[knot*states_s_controls+i] + alpha * d_dz[knot*states_s_controls+i];  
 56 |             if (i < 6){
 57 |                 s_eePos_k_traj[i] = d_eePos_traj[knot*6+i];                            
 58 |             }
 59 |         }
 60 |         block.sync();
 61 |         
 62 |         Jk = gato_plant::trackingcost<T>(state_size, control_size, knot_points, s_xux_k, s_eePos_k_traj, s_temp, d_robotModel);
 63 |         
 64 |         block.sync();
 65 |         if(knot < knot_points-1){
 66 |             ck = integratorError<T>(state_size, s_xux_k, &s_xux_k[states_s_controls], s_temp, d_robotModel, dt, block);
 67 |         }
 68 |         else{
 69 |             // diff xs vs xs_traj
 70 |             for(int i = threadIdx.x; i < state_size; i++){
 71 |                 s_temp[i] = abs((d_xu[i] + alpha *d_dz[i]) - d_xs[i]);
 72 |             }
 73 |             block.sync();
 74 |             glass::reduce<T>(state_size, s_temp);
 75 |             block.sync();
 76 |             ck = s_temp[0];
 77 |         }
 78 |         block.sync();
 79 | 
 80 |         if(thread_id == 0){
 81 |             pointmerit = Jk + mu*ck;
 82 |             d_merit_temp[alpha_multiplier*knot_points+knot] = pointmerit;
 83 |             // printf("alpha: %f knot: %d reporting merit: %f\n", alpha, knot, pointmerit);
 84 |         }
 85 |     }
 86 |     cooperative_groups::this_grid().sync();
 87 |     if(block_id == 0){
 88 |         glass::reduce<T>(knot_points, &d_merit_temp[alpha_multiplier*knot_points]);
 89 |     
 90 |         if(thread_id == 0){
 91 |             d_merits_out[alpha_multiplier] = d_merit_temp[alpha_multiplier*knot_points];
 92 |         }
 93 |     }
 94 | }
 95 | 
 96 | // zero merit out
 97 | // shared mem size get_merit_smem_size()
 98 | // cost compute for non line search
 99 | template <typename T, unsigned INTEGRATOR_TYPE = 0, bool ANGLE_WRAP = false>
100 | __global__
101 | void compute_merit(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xu, T *d_eePos_traj, T mu, T dt, void *d_dynMem_const, T *d_merit_out)
102 | {
103 |     grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *)d_dynMem_const;
104 |     const cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
105 |     const uint32_t thread_id = threadIdx.x;
106 |     const uint32_t num_threads = blockDim.x;
107 |     const uint32_t block_id = blockIdx.x;
108 | 
109 |     const uint32_t states_s_controls = state_size + control_size;
110 |     extern __shared__ T s_xux_k[];
111 | 
112 |     T Jk, ck, pointmerit;
113 |     T *s_eePos_k_traj = s_xux_k + 2 * state_size + control_size;
114 |     T *s_temp = s_eePos_k_traj + 6;
115 | 
116 |     for(unsigned knot = block_id; knot < knot_points; knot += gridDim.x){
117 | 
118 |         for(int i = thread_id; i < state_size+(knot < knot_points-1)*(states_s_controls); i+=num_threads){
119 |             s_xux_k[i] = d_xu[knot*states_s_controls+i];  
120 |             if (i < 6){
121 |                 s_eePos_k_traj[i] = d_eePos_traj[knot*6+i];                            
122 |             }
123 |         }
124 | 
125 |         block.sync();
126 |         Jk = gato_plant::trackingcost<T>(state_size, control_size, knot_points, s_xux_k, s_eePos_k_traj, s_temp, d_robotModel);
127 | 
128 | 
129 |         block.sync();
130 |         if(knot < knot_points-1){
131 |             ck = integratorError<T>(state_size, s_xux_k, &s_xux_k[states_s_controls], s_temp, d_robotModel, dt, block);
132 |         }
133 |         else{
134 |             ck = 0;
135 |         }
136 |         block.sync();
137 | 
138 |         if(thread_id == 0){
139 |             pointmerit = Jk + mu*ck;
140 |             atomicAdd(d_merit_out, pointmerit);
141 |         }
142 |     }
143 | }
144 | 


--------------------------------------------------------------------------------
/include/common/settings.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | 
  4 | 
  5 | #ifndef KNOT_POINTS
  6 | #define KNOT_POINTS 32 
  7 | #endif
  8 | 
  9 | // default value is for iiwa arm 
 10 | #ifndef STATE_SIZE
 11 | #define STATE_SIZE  14
 12 | #endif
 13 | 
 14 | 
 15 | /*******************************************************************************
 16 |  *                           Print Settings                               *
 17 |  *******************************************************************************/
 18 | 
 19 | 
 20 | #ifndef LIVE_PRINT_PATH
 21 | #define LIVE_PRINT_PATH 0
 22 | #endif 
 23 | 
 24 | #ifndef LIVE_PRINT_STATS
 25 | #define LIVE_PRINT_STATS 0
 26 | #endif
 27 | 
 28 | /*******************************************************************************
 29 |  *                           Test Settings                               *
 30 |  *******************************************************************************/
 31 | 
 32 | 
 33 | #ifndef TEST_ITERS
 34 | #define TEST_ITERS 1
 35 | #endif
 36 | 
 37 | #ifndef SAVE_DATA
 38 | #define SAVE_DATA   0
 39 | #endif 
 40 | 
 41 | #ifndef USE_DOUBLES
 42 | #define USE_DOUBLES 0
 43 | #endif
 44 | 
 45 | #if USE_DOUBLES
 46 | typedef double linsys_t;
 47 | #else
 48 | typedef float linsys_t;
 49 | #endif
 50 | 
 51 | /*******************************************************************************
 52 |  *                           MPC Settings                               *
 53 |  *******************************************************************************/
 54 | 
 55 | 
 56 | #ifndef CONST_UPDATE_FREQ
 57 | #define CONST_UPDATE_FREQ 1
 58 | #endif
 59 | 
 60 | // runs sqp a bunch of times before starting to track
 61 | #ifndef REMOVE_JITTERS
 62 | #define REMOVE_JITTERS  1
 63 | #endif
 64 | 
 65 | // this constant controls when xu and goal will be shifted, should be a fraction of a timestep
 66 | #ifndef SHIFT_THRESHOLD
 67 | #define SHIFT_THRESHOLD (1 * timestep)
 68 | #endif
 69 | 
 70 | #ifndef SIMULATION_PERIOD
 71 | #define SIMULATION_PERIOD 2000
 72 | #endif
 73 | 
 74 | #ifndef MERIT_THREADS
 75 | #define MERIT_THREADS       128
 76 | #endif 
 77 | 
 78 | // when enabled ABSOLUTE_QD_PENALTY penalizes qd like controls, rather than penalizing relative distance to precomputed traj
 79 | #ifndef ABSOLUTE_QD_PENALTY
 80 | #define ABSOLUTE_QD_PENALTY 0
 81 | #endif 
 82 | 
 83 | 
 84 | #ifndef R_COST
 85 | 	#if KNOT_POINTS == 64
 86 | #define R_COST .001 
 87 | 	#else 
 88 | #define R_COST .0001 
 89 | 	#endif
 90 | #endif
 91 | 
 92 | #ifndef QD_COST
 93 | #define QD_COST .0001 
 94 | #endif
 95 | 
 96 | 
 97 | 
 98 | /*******************************************************************************
 99 |  *                           Linsys Settings                               *
100 |  *******************************************************************************/
101 | 
102 | 
103 | /* time_linsys = 1 to record linear system solve times. 
104 | time_linsys = 0 to record number of sqp iterations. 
105 | In both cases, the tracking error will also be recorded. */
106 |     
107 | #ifndef TIME_LINSYS
108 | #define TIME_LINSYS 1
109 | #endif
110 | 
111 | #ifndef PCG_NUM_THREADS
112 | #define PCG_NUM_THREADS	128
113 | #endif
114 | 
115 | 
116 | /* LINSYS_SOLVE = 1 uses pcg as the underlying linear system solver
117 | LINSYS_SOLVE = 0 uses qdldl as the underlying linear system solver */
118 | 
119 | #ifndef LINSYS_SOLVE
120 | #define LINSYS_SOLVE 1 
121 | #endif
122 | 
123 | // Values found using experiments
124 | #ifndef PCG_MAX_ITER
125 | 	#if LINSYS_SOLVE
126 | 		#if KNOT_POINTS == 32
127 | #define PCG_MAX_ITER 173 
128 | 		#elif KNOT_POINTS == 64
129 | #define PCG_MAX_ITER 167
130 | 		#elif KNOT_POINTS == 128
131 | #define PCG_MAX_ITER 167
132 | 		#elif KNOT_POINTS == 256
133 | #define PCG_MAX_ITER 118
134 | 		#elif KNOT_POINTS == 512
135 | #define PCG_MAX_ITER 67
136 | 		#else
137 | #define PCG_MAX_ITER 200	
138 | 		#endif	
139 | 	#else 
140 | #define PCG_MAX_ITER -1
141 | #define PCG_EXIT_TOL -1 
142 | 	#endif
143 | 
144 | #endif
145 | 
146 | 
147 | /*******************************************************************************
148 |  *                           SQP Settings                               *
149 |  *******************************************************************************/
150 | 
151 | 
// The iteration cap and the top-level return type both follow TIME_LINSYS:
// timing runs return solve times (double), the others return iteration counts.
// SQP_MAX_ITER can be overridden from the build like every other setting in
// this file; the values below are only the defaults.
#if TIME_LINSYS == 1
    #ifndef SQP_MAX_ITER
    #define SQP_MAX_ITER    20
    #endif
    typedef double toplevel_return_type;
#else
    #ifndef SQP_MAX_ITER
    #define SQP_MAX_ITER    40
    #endif
    typedef uint32_t toplevel_return_type;
#endif
159 | 
160 | 
161 | #ifndef SQP_MAX_TIME_US
162 | #define SQP_MAX_TIME_US 2000 
163 | #endif
164 | 
165 | #ifndef SCHUR_THREADS
166 | #define SCHUR_THREADS       128
167 | #endif 
168 | 
169 | #ifndef DZ_THREADS
170 | #define DZ_THREADS          128
171 | #endif 
172 | 
173 | #ifndef KKT_THREADS
174 | #define KKT_THREADS         128
175 | #endif
176 | 
177 | 
178 | 
179 | /*******************************************************************************
180 |  *                           Rho Settings                               *
181 |  *******************************************************************************/
182 | 
183 | 
184 | 
185 | #ifndef RHO_MIN
186 | #define RHO_MIN 1e-3
187 | #endif
188 | 
189 | //TODO: get rid of rho in defines
190 | #ifndef RHO_FACTOR
191 | #define RHO_FACTOR 1.2 
192 | #endif
193 | 
194 | #ifndef RHO_MAX
195 | #define RHO_MAX 10 
196 | #endif
197 | 
198 | 
199 | 
200 | 


--------------------------------------------------------------------------------
/include/dynamics/iiwa/iiwa_eepos_plant.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | // // values assumed coming from an instance of grid
  3 | // namespace grid{
  4 | // 	//
  5 | // 	// TODO do I need all of these?
  6 | // 	//
  7 | 
  8 | // 	const int NUM_JOINTS = 30;
  9 | //     const int ID_DYNAMIC_SHARED_MEM_COUNT = 2340;
 10 | //     const int MINV_DYNAMIC_SHARED_MEM_COUNT = 9210;
 11 | //     const int FD_DYNAMIC_SHARED_MEM_COUNT = 10110;
 12 | //     const int ID_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
 13 | //     const int FD_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
 14 | //     const int ID_DU_MAX_SHARED_MEM_COUNT = 13410;
 15 | //     const int FD_DU_MAX_SHARED_MEM_COUNT = 16140;
 16 | //     const int SUGGESTED_THREADS = 512;
 17 | 
 18 | // 	template <typename T>
 19 | //     struct robotModel {
 20 | //         T *d_XImats;
 21 | //         int *d_topology_helpers;
 22 | //     };
 23 | // }
 24 | 
 25 | #include <stdio.h>
 26 | #include <cuda.h> 
 27 | #include <cuda_runtime.h>
 28 | #include <cuda_runtime_api.h>
 29 | #include <cooperative_groups.h>
 30 | #include "iiwa_eepos_grid.cuh"
 31 | #include "settings.cuh"
 32 | 
 33 | #include "glass.cuh"
 34 | 
 35 | // #include <random>
 36 | // #define RANDOM_MEAN 0
 37 | // #define RANDOM_STDEV 0.001
 38 | // std::default_random_engine randEng(time(0)); //seed
 39 | // std::normal_distribution<double> randDist(RANDOM_MEAN, RANDOM_STDEV); //mean followed by stdiv
 40 | 
 41 | namespace gato_plant{
 42 | 
 43 | 
 44 | 	const unsigned SUGGESTED_THREADS = grid::SUGGESTED_THREADS;
 45 | 
 46 | 	// Pi truncated to 3.14159, converted to the requested scalar type.
 47 | 	template<class Scalar> __host__ __device__
 48 | 	constexpr Scalar PI() { return Scalar(3.14159); }
 49 | 	// Gravity constant passed to the grid dynamics routines; fixed at zero here.
 50 | 	template<class Scalar> __host__ __device__
 51 | 	constexpr Scalar GRAVITY() { return Scalar(0.0); }
 52 | 
 53 | 
 54 | 	// template<class T>
 55 | 	// __host__ __device__
 56 | 	// constexpr T COST_Q1() {return static_cast<T>(Q_COST);}
 57 | 	
 58 | 	// Weight used on the second half of the state (the qd entries); value comes from the QD_COST macro.
 59 | 	template<class Scalar> __host__ __device__
 60 | 	constexpr Scalar COST_QD() { return Scalar(QD_COST); }
 61 | 
 62 | 	// Weight used on the control entries; value comes from the R_COST macro.
 63 | 	template<class Scalar> __host__ __device__
 64 | 	constexpr Scalar COST_R() { return Scalar(R_COST); }
 65 | 
 66 | 	template <typename T> // wraps grid::init_robotModel<T>() and returns the model as an opaque pointer
 67 | 	void *initializeDynamicsConstMem(){
 68 | 		grid::robotModel<T> *model = grid::init_robotModel<T>();
 69 | 		return static_cast<void *>(model);
 70 | 	}
 71 | 	template <typename T> // hands a pointer obtained from initializeDynamicsConstMem<T>() to grid::free_robotModel
 72 | 	void freeDynamicsConstMem(void *d_dynMem_const){
 73 | 		grid::free_robotModel(static_cast<grid::robotModel<T> *>(d_dynMem_const));
 74 | 	}
 75 | 
 76 | 	// Start at q = [0,0,-0.25*PI,0,0.25*PI,0.5*PI,0] with small random for qd, u, lambda
 77 | 	// template <typename T>
 78 | 	// __host__
 79 | 	// void loadInitialState(T *x){
 80 | 	// 	T q[7] = {PI<T>(),0.25*PI<T>(),0.167*PI<T>(),-0.167*PI<T>(),PI<T>(),0.167*PI<T>(),0.5*PI<T>()};
 81 | 	// 	for (int i = 0; i < 7; i++){
 82 | 	// 		x[i] = q[i]; x[i + 7] = 0;
 83 | 	// 	}
 84 | 	// }
 85 | 
 86 | 	// template <typename T>
 87 | 	// __host__
 88 | 	// void loadInitialControl(T *u){for (int i = 0; i < 7; i++){u[i] = 0;}}
 89 | 
 90 | 	// // goal at q = [-0.5*PI,0.25*PI,0.167*PI,-0.167*PI,0.125*PI,0.167*PI,0.5*PI] with 0 for qd, u, lambda
 91 | 	// template <typename T>
 92 | 	// __host__
 93 | 	// void loadGoalState(T *xg){
 94 | 	// 	T q[7] = {0,0,-0.25*PI<T>(),0,0.25*PI<T>(),0.5*PI<T>(),0};
 95 | 	// 	for (int i = 0; i < 7; i++){
 96 | 	// 		xg[i] = q[i]; xg[i + 7] = static_cast<T>(0);
 97 | 	// 	}
 98 | 	// }
 99 | 
100 | 	// Forward dynamics for (s_q, s_qd, s_u); the result of grid::forward_dynamics_inner is written to s_qdd.
101 | 	// s_XITemp supplies the XI matrices in its first 1008 entries and scratch space after them.
102 | 	// d_dynMem_const is cast to grid::robotModel<T>*; `block` is accepted but not used.
103 | 	template <typename T>
104 | 	__device__
105 | 	void forwardDynamics(T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_XITemp, void *d_dynMem_const, cooperative_groups::thread_block block){
106 | 		T *s_XImats = s_XITemp; T *s_temp = &s_XITemp[1008];
107 | 		// View the model as robotModel<T> so its scalar type follows T, as forwardDynamicsAndGradient does.
108 | 		grid::load_update_XImats_helpers<T>(s_XImats, s_q, (grid::robotModel<T> *) d_dynMem_const, s_temp);
109 | 		__syncthreads(); // barrier between loading the XI matrices and the dynamics pass that reads them
110 | 		grid::forward_dynamics_inner<T>(s_qdd, s_q, s_qd, s_u, s_XImats, s_temp, gato_plant::GRAVITY<T>());
111 | 	}
112 | 
113 | 	__host__ __device__ // shared-memory count that grid publishes for forward dynamics
114 | 	constexpr unsigned forwardDynamics_TempMemSize_Shared(){return grid::FD_DYNAMIC_SHARED_MEM_COUNT;}
115 | 
116 | 	// template <typename T>
117 | 	// __device__
118 | 	// void forwardDynamicsGradient( T *s_dqdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
119 | 	// 	grid::forward_dynamics_gradient_device<T,true>(s_dqdd, s_q, s_qd, s_u, s_temp, (grid::robotModel<T> *)d_dynMem_const,GRAVITY<T>());
120 | 	// }
121 | 
122 | 	// __host__ __device__
123 | 	// constexpr unsigned forwardDynamicsGradient_TempMemSize_Shared(){return grid::FD_DU_MAX_SHARED_MEM_COUNT;}
124 | 
125 | 
126 |     template <typename T, bool INCLUDE_DU = true> // INCLUDE_DU: also write the Minv entries to s_df_du[98..146]
127 |     __device__
128 |     void forwardDynamicsAndGradient(T *s_df_du, T *s_qdd, const T *s_q, const T *s_qd, const T *s_u, T *s_temp_in, void *d_dynMem_const){
129 | 		// s_qdd comes from grid::forward_dynamics_finish; s_df_du[0..97] receives -Minv * dc_du (7 rows x 14 columns, ind = col*7 + row).
130 | 		T *s_XITemp = s_temp_in;
131 | 		grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *) d_dynMem_const;
132 | 		// s_temp_in is carved up as: XImats [0,504), vaf (126), dc_du (98), Minv (49), and scratch from there on.
133 |         T *s_XImats = s_XITemp; T *s_vaf = &s_XITemp[504]; T *s_dc_du = &s_vaf[126]; T *s_Minv = &s_dc_du[98]; T *s_temp = &s_Minv[49];
134 |         grid::load_update_XImats_helpers<T>(s_XImats, s_q, d_robotModel, s_temp); __syncthreads();
135 |         //TODO: there is a slightly faster way as s_v does not change -- thus no recompute needed
136 |         grid::direct_minv_inner<T>(s_Minv, s_q, s_XImats, s_temp); __syncthreads();
137 |         T *s_c = s_temp; // c takes the first 7 scratch entries; the call below is given the scratch behind them
138 |         grid::inverse_dynamics_inner<T>(s_c, s_vaf, s_q, s_qd, s_XImats, &s_temp[7], GRAVITY<T>()); __syncthreads();
139 |         grid::forward_dynamics_finish<T>(s_qdd, s_u, s_c, s_Minv); __syncthreads();
140 |         grid::inverse_dynamics_inner_vaf<T>(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY<T>()); __syncthreads();
141 |         grid::inverse_dynamics_gradient_inner<T>(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY<T>()); __syncthreads();
142 |         for(int ind = threadIdx.x + threadIdx.y*blockDim.x; ind < 98; ind += blockDim.x*blockDim.y){ // thread index flattened over x and y
143 |             int row = ind % 7; int dc_col_offset = ind - row;
144 |             // Minv is SYMMETRIC_UPPER: only positions with row <= col are read, mirrored for the other half
145 |             T val = static_cast<T>(0);
146 |             for(int col = 0; col < 7; col++) {
147 |                 int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col);
148 |                 val += s_Minv[index] * s_dc_du[dc_col_offset + col];
149 |             }
150 |             s_df_du[ind] = -val;
151 |             if (INCLUDE_DU && ind < 49){ // append Minv itself, expanded to a full 7x7
152 |                 int col = ind / 7; int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col);
153 |                 s_df_du[ind + 98] = s_Minv[index];
154 |             }
155 |         }
156 |     }
157 | 
158 | 
159 | 	// template <typename T>
160 | 	// __device__
161 |     // void forwardDynamicsAndGradient(T *s_dqdd, T *s_qdd, T *s_q, T *s_qd, T *s_u,  T *s_temp_in, void *d_dynMem_const, cooperative_groups::thread_block block){
162 |        
163 | 		// grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *) d_dynMem_const;
164 | 		
165 | 		// T *s_dc_du = s_temp_in;
166 | 		// T *s_vaf = s_dc_du + 392;
167 | 		// T *s_Minv = s_vaf + 252;
168 | 		// T *s_XITemp = s_Minv + 196;
169 | 		// T *s_XImats = s_XITemp; T *s_temp = &s_XITemp[1008];
170 | 
171 | 
172 | 	    // grid::load_update_XImats_helpers<T>(s_XImats, s_q, d_robotModel, s_temp);
173 | 		
174 | 		// grid::direct_minv_inner<T>(s_Minv, s_q, s_XImats, s_temp);
175 | 		// grid::inverse_dynamics_inner<T>(s_temp, s_vaf, s_q, s_qd, s_XImats, &s_temp[14], GRAVITY<T>());
176 | 		// grid::forward_dynamics_finish<T>(s_qdd, s_u, s_temp, s_Minv);
177 | 		
178 | 		// grid::inverse_dynamics_inner_vaf<T>(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY<T>());
179 | 		// grid::inverse_dynamics_gradient_inner<T>(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY<T>());
180 | 		// for(int ind = threadIdx.x; ind < 392; ind += blockDim.x){
181 | 		// 	int row = ind % 14; int dc_col_offset = ind - row;
182 | 		// 	// account for the fact that Minv is an SYMMETRIC_UPPER triangular matrix
183 | 		// 	T val = static_cast<T>(0);
184 | 		// 	for(int col = 0; col < 14; col++) {
185 | 		// 		int index = (row <= col) * (col * 14 + row) + (row > col) * (row * 14 + col);
186 | 		// 		val += s_Minv[index] * s_dc_du[dc_col_offset + col];
187 | 		// 	}
188 | 		// 	s_temp[ind] = -val;
189 | 		// }
190 | 
191 | 		// for(int ind = threadIdx.x; ind < 392; ind += blockDim.x){
192 | 		// 	s_dqdd[ind] = s_temp[ind];
193 | 		// }
194 | 		// __syncthreads();
195 | 		
196 | 
197 | 		// T *s_XITemp = s_temp_in;
198 | 		// grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *) d_dynMem_const;
199 | 		// T *s_XImats = s_XITemp; T *s_vaf = &s_XITemp[504]; T *s_dc_du = &s_vaf[126]; T *s_Minv = &s_dc_du[98]; T *s_temp = &s_Minv[49];
200 |         // grid::load_update_XImats_helpers<T>(s_XImats, s_q, d_robotModel, s_temp); __syncthreads();
201 |         // //TODO: there is a slightly faster way as s_v does not change -- thus no recompute needed
202 |         // grid::direct_minv_inner<T>(s_Minv, s_q, s_XImats, s_temp); __syncthreads();
203 |         // T *s_c = s_temp;
204 |         // grid::inverse_dynamics_inner<T>(s_c, s_vaf, s_q, s_qd, s_XImats, &s_temp[7], GRAVITY<T>()); __syncthreads();
205 |         // grid::forward_dynamics_finish<T>(s_qdd, s_u, s_c, s_Minv); __syncthreads();
206 |         // grid::inverse_dynamics_inner_vaf<T>(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY<T>()); __syncthreads();
207 |         // grid::inverse_dynamics_gradient_inner<T>(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY<T>()); __syncthreads();
208 |         // for(int ind = threadIdx.x + threadIdx.y*blockDim.x; ind < 98; ind += blockDim.x*blockDim.y){
209 |         //     int row = ind % 7; int dc_col_offset = ind - row;
210 |         //     // account for the fact that Minv is an SYMMETRIC_UPPER triangular matrix
211 |         //     T val = static_cast<T>(0);
212 |         //     for(int col = 0; col < 7; col++) {
213 |         //         int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col);
214 |         //         val += s_Minv[index] * s_dc_du[dc_col_offset + col];
215 |         //     }
216 |         //     s_dqdd[ind] = -val;
217 |         //     if (1 && ind < 49){
218 |         //         int col = ind / 7; int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col);
219 |         //         s_dqdd[ind + 98] = s_Minv[index];
220 |         //     }
221 |         // }
222 | 
223 | 
224 | 
225 | 		// grid::robotModel<T> *d_robotModel = (grid::robotModel<T> *) d_dynMem_const;
226 | 		// grid::forward_dynamics_gradient_device<T>(s_dqdd, s_q, s_qd, s_u, d_robotModel, GRAVITY<T>());
227 |     // }
228 | 
229 | 
230 | 	// Temp-buffer size for forwardDynamicsAndGradient, taken straight from grid::FD_DU_MAX_SHARED_MEM_COUNT.
231 | 	__host__ __device__ constexpr unsigned forwardDynamicsAndGradient_TempMemSize_Shared(){ return grid::FD_DU_MAX_SHARED_MEM_COUNT; }
232 | 
233 | 	// Temp-buffer size for trackingcost: state_size/2 + control_size + 3 cost slots, 6 end-effector values, then
234 | 	// grid::EE_POS_SHARED_MEM_COUNT entries of scratch. knot_points does not enter the result.
235 | 	__host__ unsigned trackingcost_TempMemCt_Shared(uint32_t state_size, uint32_t control_size, uint32_t knot_points){
236 | 		return grid::EE_POS_SHARED_MEM_COUNT + (state_size/2 + control_size + 3 + 6);
237 | 	}
238 | 
239 | 	// Tracking cost of one knot point. All threads of the block have to call this together because it
240 | 	// synchronizes with __syncthreads(). The value returned is the sum of
241 | 	//   0.5 * QD_COST * v^2 over the second half of the state (entries state_size/2 .. state_size-1 of s_xu),
242 | 	//   0.5 * R_COST * u^2 over the control entries, only when blockIdx.x < knot_points - 1,
243 | 	//   0.5 * e^2 over the first 3 end-effector outputs, with e measured against s_eePos_traj.
244 | 	// The way s_temp is carved up below matches the count returned by trackingcost_TempMemCt_Shared.
245 | 	///TODO: get rid of divergence
246 | 	template <typename T>
247 | 	__device__
248 | 	T trackingcost(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *s_xu, T *s_eePos_traj, T *s_temp, const grid::robotModel<T> *d_robotModel){
249 | 
250 | 		const T vel_weight = COST_QD<T>();
251 | 		const T ctrl_weight = COST_R<T>();
252 | 		const uint32_t half_state = state_size/2;
253 | 
254 | 		// number of QD-weighted plus R-weighted terms
255 | 		const uint32_t threadsNeeded = half_state + control_size * (blockIdx.x < knot_points - 1);
256 | 
257 | 		// s_temp is split into: cost terms (threadsNeeded + 3), end-effector output (6), scratch for grid
258 | 		T *s_cost_vec = s_temp;
259 | 		T *s_eePos_cost = s_cost_vec + threadsNeeded + 3;
260 | 		T *s_extra_temp = s_eePos_cost + 6;
261 | 
262 | 		// term i always reads s_xu[half_state + i]; only the weight changes once i passes half_state
263 | 		for(int i = threadIdx.x; i < threadsNeeded; i += blockDim.x){
264 | 			const T weight = (i < half_state) ? vel_weight : ctrl_weight;
265 | 			const T entry = s_xu[i + half_state];
266 | 			s_cost_vec[i] = static_cast<T>(0.5) * (weight * entry * entry);
267 | 		}
268 | 
269 | 		__syncthreads();
270 | 		grid::end_effector_positions_device<T>(s_eePos_cost, s_xu, s_extra_temp, d_robotModel);
271 | 		__syncthreads();
272 | 
273 | 		// if(threadIdx.x==0){
274 | 		// 	printf("block %d with input %f,%f,%f,%f,%f,%f,%f\n", blockIdx.x, s_xu[7],s_xu[8],s_xu[9],s_xu[10],s_xu[11],s_xu[12],s_xu[13]);
275 | 		// }
276 | 
277 | 		// the 3 position-error terms are appended behind the terms written above
278 | 		for(int i = threadIdx.x; i < 3; i += blockDim.x){
279 | 			const T pos_err = s_eePos_cost[i] - s_eePos_traj[i];
280 | 			s_cost_vec[threadsNeeded + i] = static_cast<T>(0.5) * pos_err * pos_err;
281 | 		}
282 | 		__syncthreads();
283 | 
284 | 		// glass::reduce is relied on to leave the total in slot 0
285 | 		glass::reduce<T>(3 + threadsNeeded, s_cost_vec);
286 | 		__syncthreads();
287 | 
288 | 		const T total = s_cost_vec[0];
289 | 		return total;
290 | 	}
291 | 
292 | 
293 | 	// Gradient and Hessian of the tracking cost for one knot point (the routine used for every block but the last).
294 | 	// State outputs go to s_qk / s_Qk; control outputs go to s_rk / s_Rk and are skipped when computeR is false.
295 | 	template <typename T, bool computeR=true>
296 | 	__device__
297 | 	void trackingCostGradientAndHessian(uint32_t state_size, 
298 | 										uint32_t control_size, 
299 | 										T *s_xu, 
300 | 										T *s_eePos_traj, 
301 | 										T *s_Qk, 
302 | 										T *s_qk, 
303 | 										T *s_Rk, 
304 | 										T *s_rk,
305 | 										T *s_temp,
306 | 										void *d_robotModel)
307 | 	{
308 | 		const T vel_weight = COST_QD<T>();
309 | 		const T ctrl_weight = COST_R<T>();
310 | 		const uint32_t half_state = state_size / 2;
311 | 		grid::robotModel<T> *model = (grid::robotModel<T> *)d_robotModel;
312 | 
313 | 		// s_temp is split into: end-effector output (6), its gradient (6 values per entry of the first state half), scratch
314 | 		T *s_eePos = s_temp;
315 | 		T *s_eePos_grad = s_eePos + 6;
316 | 		T *s_scratch = s_eePos_grad + 6 * state_size/2;
317 | 
318 | 		// one work item per state entry, plus one per control entry when computeR is set
319 | 		const uint32_t threads_needed = state_size + control_size*computeR;
320 | 
321 | 		grid::end_effector_positions_device<T>(s_eePos, s_xu, s_scratch, model);
322 | 		__syncthreads();
323 | 		grid::end_effector_positions_gradient_device<T>(s_eePos_grad, s_xu, s_scratch, model);
324 | 		__syncthreads();
325 | 
326 | 		///TODO: both passes below could be much faster with no divergence
327 | 		for (int i = threadIdx.x; i < threads_needed; i += blockDim.x){
328 | 			if (i >= state_size){
329 | 				// control entry: R_COST * u
330 | 				s_rk[i - state_size] = ctrl_weight * s_xu[i];
331 | 			}
332 | 			else if (i >= half_state){
333 | 				// second half of the state: QD_COST * v
334 | 				s_qk[i] = vel_weight * s_xu[i];
335 | 			}
336 | 			else{
337 | 				// first half of the state: x/y/z end-effector error weighted by the first three
338 | 				// gradient values stored for entry i
339 | 				const T ex = s_eePos[0] - s_eePos_traj[0];
340 | 				const T ey = s_eePos[1] - s_eePos_traj[1];
341 | 				const T ez = s_eePos[2] - s_eePos_traj[2];
342 | 				const T *grad_i = s_eePos_grad + 6 * i;
343 | 				s_qk[i] = grad_i[0] * ex + grad_i[1] * ey + grad_i[2] * ez;
344 | 			}
345 | 		}
346 | 
347 | 		// the Hessian pass reads gradient entries written by other threads
348 | 		__syncthreads();
349 | 
350 | 		// ---- Hessian ----
351 | 		for (int i = threadIdx.x; i < threads_needed; i += blockDim.x){
352 | 			if (i >= state_size){
353 | 				// control row: R_COST on the diagonal, zero elsewhere
354 | 				const uint32_t ctrl_row = i - state_size;
355 | 				T *row_out = s_Rk + ctrl_row*control_size;
356 | 				for(int j = 0; j < control_size; j++){
357 | 					if (ctrl_row == j){ row_out[j] = ctrl_weight; }
358 | 					else{ row_out[j] = static_cast<T>(0); }
359 | 				}
360 | 				continue;
361 | 			}
362 | 
363 | 			// state row i
364 | 			const bool row_in_first_half = i < half_state;
365 | 			for(int j = 0; j < state_size; j++){
366 | 				T entry;
367 | 				if (row_in_first_half && j < half_state){
368 | 					// first-half block: product of the two gradient entries
369 | 					entry = s_qk[i] * s_qk[j];
370 | 				}
371 | 				else{
372 | 					// everywhere else: QD_COST on the diagonal, zero off it
373 | 					entry = (i == j) ? vel_weight : static_cast<T>(0);
374 | 				}
375 | 				s_Qk[i*state_size + j] = entry;
376 | 			}
377 | 		}
378 | 	}
379 | 
380 | 	// Last block: computes the cost terms of knot k (state and control) and of the state that follows it.
381 | 	// NOTE(review): s_xux is taken to be laid out as [x_k | u_k | x_{k+1}] and s_eePos_traj as two 6-entry
382 | 	// targets back to back -- confirm against the caller.
383 | 	template <typename T>
384 | 	__device__
385 | 	void trackingCostGradientAndHessian_lastblock(uint32_t state_size, 
386 | 												  uint32_t control_size, 
387 | 												  T *s_xux, 
388 | 												  T *s_eePos_traj, 
389 | 												  T *s_Qk, T *s_qk, 
390 | 												  T *s_Rk, T *s_rk, 
391 | 												  T *s_Qkp1, T *s_qkp1,
392 | 												  T *s_temp,
393 | 												  void *d_dynMem_const)
394 | 	{
395 | 		// x_{k+1} starts behind x_k and u_k; the second pass has to read it instead of x_k again
396 | 		T *s_x_next = s_xux + state_size + control_size;
397 | 		trackingCostGradientAndHessian<T>(state_size, control_size, s_xux, s_eePos_traj, s_Qk, s_qk, s_Rk, s_rk, s_temp, d_dynMem_const);
398 | 		__syncthreads();
399 | 		trackingCostGradientAndHessian<T, false>(state_size, control_size, s_x_next, &s_eePos_traj[6], s_Qkp1, s_qkp1, nullptr, nullptr, s_temp, d_dynMem_const);
400 | 		__syncthreads();
401 | 	}
402 | 
403 | 	// __host__ __device__
404 | 	// constexpr unsigned costGradientAndHessian_TempMemSize_Shared(){return 0;}
405 | }
406 | 
407 | 


--------------------------------------------------------------------------------
/include/dynamics/iiwa/iiwa_plant.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | // // values assumed coming from an instance of grid
  3 | // namespace grid{
  4 | // 	//
  5 | // 	// TODO do I need all of these?
  6 | // 	//
  7 | 
  8 | // 	const int NUM_JOINTS = 30;
  9 | //     const int ID_DYNAMIC_SHARED_MEM_COUNT = 2340;
 10 | //     const int MINV_DYNAMIC_SHARED_MEM_COUNT = 9210;
 11 | //     const int FD_DYNAMIC_SHARED_MEM_COUNT = 10110;
 12 | //     const int ID_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
 13 | //     const int FD_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
 14 | //     const int ID_DU_MAX_SHARED_MEM_COUNT = 13410;
 15 | //     const int FD_DU_MAX_SHARED_MEM_COUNT = 16140;
 16 | //     const int SUGGESTED_THREADS = 512;
 17 | 
 18 | // 	template <typename T>
 19 | //     struct robotModel {
 20 | //         T *d_XImats;
 21 | //         int *d_topology_helpers;
 22 | //     };
 23 | // }
 24 | 
 25 | #include <stdio.h>
 26 | #include <cuda.h> 
 27 | #include <cuda_runtime.h>
 28 | #include <cuda_runtime_api.h>
 29 | #include <cooperative_groups.h>
 30 | #include "iiwa_grid.cuh"
 31 | #include "../settings.cuh"
 32 | 
 33 | #include "glass.cuh"
 34 | 
 35 | // #include <random>
 36 | // #define RANDOM_MEAN 0
 37 | // #define RANDOM_STDEV 0.001
 38 | // std::default_random_engine randEng(time(0)); //seed
 39 | // std::normal_distribution<double> randDist(RANDOM_MEAN, RANDOM_STDEV); //mean followed by stdiv
 40 | 
 41 | namespace gato_plant{
 42 | 
 43 | 
 44 | 	const unsigned SUGGESTED_THREADS = grid::SUGGESTED_THREADS;
 45 | 
 46 | 	// Pi truncated to 3.14159, converted to the requested scalar type.
 47 | 	template<class Scalar> __host__ __device__
 48 | 	constexpr Scalar PI() { return Scalar(3.14159); }
 49 | 	// Gravity constant passed to the grid dynamics routines; fixed at zero here.
 50 | 	template<class Scalar> __host__ __device__
 51 | 	constexpr Scalar GRAVITY() { return Scalar(0.0); }
 52 | 
 53 | 
 54 | 	// Weight on the first half of the state; value comes from the Q_COST macro.
 55 | 	template<class Scalar> __host__ __device__
 56 | 	constexpr Scalar COST_Q1() { return Scalar(Q_COST); }
 57 | 
 58 | 	// Weight on the second half of the state (the qd entries); value comes from the QD_COST macro.
 59 | 	template<class Scalar> __host__ __device__
 60 | 	constexpr Scalar COST_QD() { return Scalar(QD_COST); }
 61 | 
 62 | 	// Weight on the control entries; value comes from the R_COST macro.
 63 | 	template<class Scalar> __host__ __device__
 64 | 	constexpr Scalar COST_R() { return Scalar(R_COST); }
 65 | 
 66 | 	template <typename T> // wraps grid::init_robotModel<T>() and returns the model as an opaque pointer
 67 | 	void *initializeDynamicsConstMem(){
 68 | 		grid::robotModel<T> *model = grid::init_robotModel<T>();
 69 | 		return static_cast<void *>(model);
 70 | 	}
 71 | 	template <typename T> // hands a pointer obtained from initializeDynamicsConstMem<T>() to grid::free_robotModel
 72 | 	void freeDynamicsConstMem(void *d_dynMem_const){
 73 | 		grid::free_robotModel(static_cast<grid::robotModel<T> *>(d_dynMem_const));
 74 | 	}
 75 | 
 76 | 	// Start state: q = [PI, 0.25*PI, 0.167*PI, -0.167*PI, PI, 0.167*PI, 0.5*PI], qd = 0.
 77 | 	// x must have room for 14 entries: positions go to x[0..6], zero velocities to x[7..13].
 78 | 	template <typename T>
 79 | 	__host__
 80 | 	void loadInitialState(T *x){
 81 | 		const T start_q[7] = {PI<T>(),0.25*PI<T>(),0.167*PI<T>(),-0.167*PI<T>(),PI<T>(),0.167*PI<T>(),0.5*PI<T>()};
 82 | 		for (int joint = 0; joint < 7; ++joint){ x[joint] = start_q[joint]; }
 83 | 		for (int joint = 7; joint < 14; ++joint){ x[joint] = 0; }
 84 | 	}
 85 | 
 86 | 	template <typename T> // zero control: clears u[0] .. u[6]
 87 | 	__host__
 88 | 	void loadInitialControl(T *u){ for (T *slot = u; slot != u + 7; ++slot){ *slot = 0; } }
 89 | 
 90 | 	// Goal state: q = [0, 0, -0.25*PI, 0, 0.25*PI, 0.5*PI, 0], qd = 0.
 91 | 	// xg must have room for 14 entries: positions go to xg[0..6], zero velocities to xg[7..13].
 92 | 	template <typename T>
 93 | 	__host__
 94 | 	void loadGoalState(T *xg){
 95 | 		const T goal_q[7] = {0,0,-0.25*PI<T>(),0,0.25*PI<T>(),0.5*PI<T>(),0};
 96 | 		for (int joint = 0; joint < 7; ++joint){ xg[joint] = goal_q[joint]; }
 97 | 		for (int joint = 7; joint < 14; ++joint){ xg[joint] = static_cast<T>(0); }
 98 | 	}
 99 | 
100 | 	// Thin wrapper: forwards to grid::forward_dynamics_device<T> with this plant's gravity constant. `block` is not used.
101 | 	template <typename T>
102 | 	__device__
103 | 	void forwardDynamics(T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
104 | 		grid::robotModel<T> *d_robotModel = static_cast<grid::robotModel<T> *>(d_dynMem_const);
105 | 		grid::forward_dynamics_device<T>(s_qdd, s_q, s_qd, s_u, s_temp, d_robotModel, GRAVITY<T>());
106 | 	}
107 | 	__host__ __device__ constexpr unsigned forwardDynamics_TempMemSize_Shared(){ return grid::FD_DYNAMIC_SHARED_MEM_COUNT; } // temp size for forwardDynamics
108 | 
109 | 	// Thin wrapper: forwards to grid::forward_dynamics_gradient_device<T,true> with this plant's gravity constant. `block` is not used.
110 | 	template <typename T>
111 | 	__device__
112 | 	void forwardDynamicsGradient( T *s_dqdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
113 | 		grid::robotModel<T> *d_robotModel = static_cast<grid::robotModel<T> *>(d_dynMem_const);
114 | 		grid::forward_dynamics_gradient_device<T,true>(s_dqdd, s_q, s_qd, s_u, s_temp, d_robotModel, GRAVITY<T>());
115 | 	}
116 | 	__host__ __device__ constexpr unsigned forwardDynamicsGradient_TempMemSize_Shared(){ return grid::FD_DU_MAX_SHARED_MEM_COUNT_new_version; } // temp size for forwardDynamicsGradient
117 | 
118 | 	// Thin wrapper: forwards to grid::forward_dynamics_and_gradient_device<T,true> with this plant's gravity constant. `block` is not used.
119 | 	template <typename T>
120 | 	__device__
121 | 	void forwardDynamicsAndGradient(T *s_dqdd, T *s_qdd, T *s_q, T *s_qd, T *s_u,  T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
122 | 		grid::robotModel<T> *d_robotModel = static_cast<grid::robotModel<T> *>(d_dynMem_const);
123 | 		grid::forward_dynamics_and_gradient_device<T,true>(s_dqdd, s_qdd, s_q, s_qd, s_u, s_temp, d_robotModel, GRAVITY<T>());
124 | 	}
125 | 
126 | 	__host__ __device__ constexpr unsigned forwardDynamicsAndGradient_TempMemSize_Shared(){ return grid::FD_DU_MAX_SHARED_MEM_COUNT_new_version; } // temp size for forwardDynamicsAndGradient
127 | 
128 | 
129 | 	// Joint-space tracking cost of one knot point, returned as the reduced sum of
130 | 	//   0.5 * Q_COST * (x - x_ref)^2 over the first half of the state,
131 | 	//   0.5 * QD_COST * e^2 over the second half, where e is x itself when ABSOLUTE_QD_PENALTY is set and x - x_ref otherwise,
132 | 	//   0.5 * R_COST * u^2 over the controls, left out when blockIdx.x == knot_points - 1.
133 | 	// s_temp receives one term per entry before the reduction. Work is split by threadIdx.x / blockDim.x while the
134 | 	// barriers go through g.
135 | 	///TODO: get rid of divergence
136 | 	template <typename T>
137 | 	__device__
138 | 	T trackingcost(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *s_xux, T *s_xux_traj, T *s_temp, cooperative_groups::thread_group g = cooperative_groups::this_thread_block()){
139 | 
140 | 		const T pos_weight = COST_Q1<T>();
141 | 		const T vel_weight = COST_QD<T>();
142 | 		const T ctrl_weight = COST_R<T>();
143 | 		const uint32_t threadsNeeded = state_size + control_size * (blockIdx.x != knot_points - 1);
144 | 
145 | 		for(int i = threadIdx.x; i < threadsNeeded; i += blockDim.x){
146 | 			T weight;
147 | 			T err = s_xux[i];
148 | 			if(i >= state_size){
149 | 				// control entry: penalized as is
150 | 				weight = ctrl_weight;
151 | 			}
152 | 			else if(i < state_size / 2){
153 | 				// first half of the state: distance to the reference
154 | 				weight = pos_weight;
155 | 				err -= s_xux_traj[i];
156 | 			}
157 | 			else{
158 | 				weight = vel_weight;
159 | #if !ABSOLUTE_QD_PENALTY
160 | 				err -= s_xux_traj[i];
161 | #endif
162 | 			}
163 | 			s_temp[i] = static_cast<T>(0.5) * (weight * err * err);
164 | 		}
165 | 
166 | 		g.sync();
167 | 		glass::reduce<T>(threadsNeeded, s_temp);
168 | 		g.sync();
169 | 
170 | 		return s_temp[0];
171 | 	}
172 | 
173 | 
174 | 	// Gradient and Hessian of the joint-space tracking cost for one knot point (every block but the last).
175 | 	//   s_qk / s_Qk : gradient and Hessian for the state
176 | 	//   s_rk / s_Rk : gradient and Hessian for the control
177 | 	// Both Hessians are diagonal; each row is written in full, zeros included. block_id is not used.
178 | 	///TODO: costgradientandhessian could be much faster with no divergence
179 | 	template <typename T>
180 | 	__device__
181 | 	void trackingCostGradientAndHessian(uint32_t state_size, 
182 | 										uint32_t control_size, 
183 | 										T *s_xu, 
184 | 										T *s_xu_traj, 
185 | 										T *s_Qk, 
186 | 										T *s_qk, 
187 | 										T *s_Rk, 
188 | 										T *s_rk,
189 | 										uint32_t block_id, 
190 | 										cooperative_groups::thread_group g)
191 | 	{
192 | 		const T pos_weight = COST_Q1<T>();
193 | 		const T vel_weight = COST_QD<T>();
194 | 		const T ctrl_weight = COST_R<T>();
195 | 
196 | 		const uint32_t half_state = state_size / 2;
197 | 		const uint32_t threadsNeeded = state_size + control_size;
198 | 
199 | 		// entries are dealt out to the threads of g, one output row per entry
200 | 		for (int i = g.thread_rank(); i < threadsNeeded; i += g.size()){
201 | 
202 | 			if (i >= state_size){
203 | 				// ---- control entry ----
204 | 				const uint32_t ctrl_row = i - state_size;
205 | 				s_rk[ctrl_row] = ctrl_weight * s_xu[i];
206 | 
207 | 				T *row_out = s_Rk + ctrl_row*control_size;
208 | 				for(int j = 0; j < control_size; j++){
209 | 					if (ctrl_row == j){ row_out[j] = ctrl_weight; }
210 | 					else{ row_out[j] = static_cast<T>(0); }
211 | 				}
212 | 				continue;
213 | 			}
214 | 
215 | 			// ---- state entry ----
216 | 			T err = s_xu[i];
217 | 			T weight;
218 | 			if (i < half_state){
219 | 				// first half of the state: distance to the reference
220 | 				weight = pos_weight;
221 | 				err -= s_xu_traj[i];
222 | 			}
223 | 			else{
224 | 				// second half: the value itself or its distance to the reference, chosen at compile time
225 | 				weight = vel_weight;
226 | #if !ABSOLUTE_QD_PENALTY
227 | 				err -= s_xu_traj[i];
228 | #endif
229 | 			}
230 | 			s_qk[i] = weight * err;
231 | 
232 | 			// Hessian row i: the weight of column j sits on the diagonal, zero elsewhere
233 | 			T *row_out = s_Qk + i*state_size;
234 | 			for(int j = 0; j < state_size; j++){
235 | 				if (i != j){ row_out[j] = static_cast<T>(0); }
236 | 				else if (j < half_state){ row_out[j] = pos_weight; }
237 | 				else{ row_out[j] = vel_weight; }
238 | 			}
239 | 		}
240 | 	}
241 | 
242 | 	// Last block: same cost terms as trackingCostGradientAndHessian, extended by the state that follows the knot.
243 | 	// s_xux and s_xux_traj are both indexed as [state (state_size) | control (control_size) | next state (state_size)].
244 | 	//   s_qk / s_Qk     : gradient and Hessian for the state
245 | 	//   s_rk / s_Rk     : gradient and Hessian for the control
246 | 	//   s_qkp1 / s_Qkp1 : gradient and Hessian for the next state
247 | 	// All three Hessians are diagonal; each row is written in full, zeros included. block_id is not used.
248 | 	template <typename T>
249 | 	__device__
250 | 	void trackingCostGradientAndHessian_lastblock(uint32_t state_size, 
251 | 												  uint32_t control_size, 
252 | 												  T *s_xux, 
253 | 												  T *s_xux_traj, 
254 | 												  T *s_Qk, 
255 | 												  T *s_qk, 
256 | 												  T *s_Rk, 
257 | 												  T *s_rk, 
258 | 												  T *s_Qkp1, 
259 | 												  T *s_qkp1,
260 | 												  uint32_t block_id, 
261 | 												  cooperative_groups::thread_group g)
262 | 	{
263 | 		const T pos_weight = COST_Q1<T>();
264 | 		const T vel_weight = COST_QD<T>();
265 | 		const T ctrl_weight = COST_R<T>();
266 | 
267 | 		const uint32_t half_state = state_size / 2;
268 | 		const uint32_t next_state_begin = state_size + control_size;
269 | 		unsigned threadsNeeded = 2*state_size + control_size;
270 | 
271 | 		// entries are dealt out to the threads of g, one output row per entry
272 | 		for (int i = g.thread_rank(); i < threadsNeeded; i += g.size()){
273 | 
274 | 			const bool is_control = (i >= state_size) && (i < next_state_begin);
275 | 			if (is_control){
276 | 				// ---- control entry ----
277 | 				const uint32_t ctrl_row = i - state_size;
278 | 				s_rk[ctrl_row] = ctrl_weight * s_xux[i];
279 | 
280 | 				T *row_out = s_Rk + ctrl_row*control_size;
281 | 				for(int j = 0; j < control_size; j++){
282 | 					if (ctrl_row == j){
283 | 						row_out[j] = ctrl_weight;
284 | 					}
285 | 					else{
286 | 						row_out[j] = static_cast<T>(0);
287 | 					}
288 | 				}
289 | 				continue;
290 | 			}
291 | 
292 | 			// ---- state entry: either the state of this knot or the one that follows ----
293 | 			// The two differ only in where the results go and in the row index within that output.
294 | 			const bool is_next_state = (i >= next_state_begin);
295 | 			T *grad_out = is_next_state ? s_qkp1 : s_qk;
296 | 			T *hess_out = is_next_state ? s_Qkp1 : s_Qk;
297 | 			uint32_t row = i;
298 | 			if (is_next_state){
299 | 				row = i - state_size - control_size;
300 | 			}
301 | 
302 | 			// gradient: weight times error, the error being read at position i of the inputs
303 | 			T err = s_xux[i];
304 | 			T weight;
305 | 			if (row < half_state){
306 | 				// first half of the state: distance to the reference
307 | 				weight = pos_weight;
308 | 				err -= s_xux_traj[i];
309 | 			}
310 | 			else{
311 | 				// second half: the value itself or its distance to the reference, chosen at compile time
312 | 				weight = vel_weight;
313 | #if !ABSOLUTE_QD_PENALTY
314 | 				err -= s_xux_traj[i];
315 | #endif
316 | 			}
317 | 			grad_out[row] = weight * err;
318 | 
319 | 			// Hessian row: the weight of column j sits on the diagonal, zero elsewhere
320 | 			T *row_out = hess_out + row*state_size;
321 | 			for(int j = 0; j < state_size; j++){
322 | 				if (row != j){ row_out[j] = static_cast<T>(0); }
323 | 				else if (j < half_state){ row_out[j] = pos_weight; }
324 | 				else{ row_out[j] = vel_weight; }
325 | 			}
326 | 		}
327 | 	}
328 | 
329 | 	__host__ __device__ // reports 0: the cost gradient/Hessian routines of this plant take no temp-buffer argument
330 | 	constexpr unsigned costGradientAndHessian_TempMemSize_Shared(){ return 0u; }
331 | }
332 | 
333 | 


--------------------------------------------------------------------------------
/include/dynamics/rbd_plant.cuh:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | // #include "iiwa_plant.cuh"
4 | 
5 | #include "iiwa/iiwa_eepos_plant.cuh"


--------------------------------------------------------------------------------
/include/mpcsim.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <iomanip>
  3 | #include <fstream>
  4 | #include <iostream>
  5 | #include <vector>
  6 | #include <numeric>
  7 | #include <algorithm>
  8 | #include <cstdint>
  9 | #include <cublas_v2.h>
 10 | #include <math.h>
 11 | #include <cmath>
 12 | #include <random>
 13 | #include <cuda_runtime.h>
 14 | #include <tuple>
 15 | #include <time.h>
 16 | #include "integrator.cuh"
 17 | #include "settings.cuh"
 18 | #include "utils/experiment.cuh"
 19 | #include "gpuassert.cuh"
 20 | 
 21 | #if LINSYS_SOLVE == 1
 22 | #include "pcg/sqp.cuh"
 23 | #else 
 24 | #include "qdldl/sqp.cuh"
 25 | #endif
 26 | 
 27 | 
 28 | 
 29 | // Accumulates |d_xs[i] - d_xu_goal[i]| over the first half of the state into *d_tracking_error.
 30 | // Indexing uses threadIdx.x / blockDim.x only, and the sum is added to whatever *d_tracking_error already holds.
 31 | template <typename T>
 32 | __global__
 33 | void compute_tracking_error_kernel(T *d_tracking_error, uint32_t state_size, T *d_xu_goal, T *d_xs){
 34 |     const uint32_t half_state = state_size/2;
 35 |     for(int ind = threadIdx.x; ind < half_state; ind += blockDim.x){
 36 |         const T gap = d_xs[ind] - d_xu_goal[ind];
 37 |         atomicAdd(d_tracking_error, abs(gap));
 38 |     }
 39 | }
 40 | 
 41 | 
 42 | // Host wrapper: returns the sum over the first half of the state of |d_xs[i] - d_xu_goal[i]|.
 43 | // Both pointers are passed to the kernel unchanged, so they must be readable from device code.
 44 | template <typename T>
 45 | T compute_tracking_error(uint32_t state_size, T *d_xu_goal, T *d_xs){
 46 |     T h_tracking_error = static_cast<T>(0);
 47 |     T *d_tracking_error;
 48 |     gpuErrchk(cudaMalloc(&d_tracking_error, sizeof(T)));
 49 |     gpuErrchk(cudaMemcpy(d_tracking_error, &h_tracking_error, sizeof(T), cudaMemcpyHostToDevice)); // zero the accumulator
 50 |     compute_tracking_error_kernel<T><<<1,32>>>(d_tracking_error, state_size, d_xu_goal, d_xs);
 51 |     gpuErrchk(cudaPeekAtLastError()); // report a failed launch here instead of at whichever call happens to come next
 52 |     gpuErrchk(cudaMemcpy(&h_tracking_error, d_tracking_error, sizeof(T), cudaMemcpyDeviceToHost)); // blocking copy, so the kernel has finished
 53 |     gpuErrchk(cudaFree(d_tracking_error));
 54 |     return h_tracking_error;
 55 | }
 56 | 
 57 | 
 58 | template <typename T>
 59 | void dump_tracking_data(std::vector<int> *pcg_iters, std::vector<bool> *pcg_exits, std::vector<double> *linsys_times, std::vector<double> *sqp_times, std::vector<uint32_t> *sqp_iters, 
 60 |                 std::vector<bool> *sqp_exits, std::vector<T> *tracking_errors, std::vector<std::vector<T>> *tracking_path, uint32_t timesteps_taken, 
 61 |                 uint32_t control_updates_taken, uint32_t start_state_ind, uint32_t goal_state_ind, uint32_t test_iter,
 62 |                 std::string filename_prefix){
 63 |     // Writes every recorded series to its own text file named
 64 |     //   <filename_prefix>_<test_iter>_<series>.result
 65 |     // Scalar series get one value per line, tracking_path one comma-terminated row per line, and the
 66 |     // stats file gets the two counters. All vector pointers are dereferenced and must be valid.
 67 |     // start_state_ind and goal_state_ind are not used.
 68 | 
 69 |     const std::string stem = filename_prefix + "_" + std::to_string(test_iter) + "_";
 70 |     auto resultPath = [&stem](const std::string& series) {
 71 |         return stem + series + ".result";
 72 |     };
 73 | 
 74 |     // one value per line; a file that cannot be opened is reported and skipped
 75 |     auto writeSeries = [&resultPath](const auto& series, const std::string& label) {
 76 |         std::ofstream out(resultPath(label));
 77 |         if (out.is_open()) {
 78 |             for (const auto& value : *series) {
 79 |                 out << value << '\n';
 80 |             }
 81 |             out.close();
 82 |             return;
 83 |         }
 84 |         std::cerr << "Failed to open " << label << " file.\n";
 85 |     };
 86 | 
 87 |     writeSeries(pcg_iters, "pcg_iters");
 88 |     writeSeries(linsys_times, "linsys_times");
 89 |     writeSeries(sqp_times, "sqp_times");
 90 |     writeSeries(sqp_iters, "sqp_iters");
 91 |     writeSeries(sqp_exits, "sqp_exits");
 92 |     writeSeries(tracking_errors, "tracking_errors");
 93 |     writeSeries(pcg_exits, "pcg_exits");
 94 | 
 95 |     // tracking_path: one row per line, every item followed by a comma
 96 |     std::ofstream path_out(resultPath("tracking_path"));
 97 |     if (!path_out.is_open()) {
 98 |         std::cerr << "Failed to open tracking_path file.\n";
 99 |         return; // the stats file is not written in this case
100 |     }
101 |     for (const auto& row : *tracking_path) {
102 |         for (const auto& item : row) {
103 |             path_out << item << ',';
104 |         }
105 |         path_out << '\n';
106 |     }
107 |     path_out.close();
108 |     // stats: the two counters, one per line
109 |     std::ofstream stats_out(resultPath("stats"));
110 |     if (!stats_out.is_open()) {
111 |         std::cerr << "Failed to open stats file.\n";
112 |         return;
113 |     }
114 |     stats_out << "timesteps: " << timesteps_taken << "\n" << "control_updates: " << control_updates_taken << "\n";
115 |     stats_out.close();
116 | }
117 | 
118 | 
// Prints the compile-time experiment settings (the configuration macros
// referenced below) to stdout, one "label: value" line per setting,
// followed by two blank lines.
void print_test_config(){
    std::ostream &out = std::cout;

    const char *datatype_label  = USE_DOUBLES ? "DOUBLE" : "FLOAT";
    const char *sqp_exit_label  = CONST_UPDATE_FREQ ? "CONSTANT TIME" : "CONSTANT ITERS";
    const char *solver_label    = (LINSYS_SOLVE == 1) ? "PCG" : "QDLDL";
    const char *save_data_label = SAVE_DATA ? "ON" : "OFF";
    const char *jitters_label   = REMOVE_JITTERS ? "ON" : "OFF";

    // problem size and cost settings
    out << "Knot points: " << KNOT_POINTS << "\n"
        << "State size: " << STATE_SIZE << "\n"
        << "Datatype: " << datatype_label << "\n"
        << "Sqp exits condition: " << sqp_exit_label << "\n"
        << "QD COST: " << QD_COST << "\n"
        << "R COST: " << R_COST << "\n"
        << "Rho factor: " << RHO_FACTOR << "\n"
        << "Rho max: " << RHO_MAX << "\n"
        << "Test iters: " << TEST_ITERS << "\n";

    // SQP termination limit: time budget or iteration budget
#if CONST_UPDATE_FREQ
    out << "Max sqp time: " << SQP_MAX_TIME_US << "\n";
#else
    out << "Max sqp iter: " << SQP_MAX_ITER << "\n";
#endif

    // linear system solver settings
    out << "Solver: " << solver_label << "\n";
#if LINSYS_SOLVE == 1
    out << "Max pcg iter: " << PCG_MAX_ITER << "\n";
#endif

    out << "Save data: " << save_data_label << "\n"
        << "Jitters: " << jitters_label << "\n";

    out << "\n\n";
}
144 | 
145 | 
/**
 * Runs a closed-loop MPC tracking simulation against a precomputed trajectory.
 *
 * Each control update:
 *   1. solves the SQP problem (sqpSolvePcg when LINSYS_SOLVE == 1, otherwise
 *      sqpSolveQdldl) for the current window d_xu / d_eePos_goal / d_lambda,
 *   2. calls simple_simulate with the PREVIOUS control trajectory (d_xu_old)
 *      for either SIMULATION_PERIOD (CONST_UPDATE_FREQ) or the measured SQP
 *      solve time,
 *   3. once more than SHIFT_THRESHOLD of a timestep has elapsed, records the
 *      tracking error and shifts d_xu, d_eePos_goal and d_lambda by one knot
 *      point, refilling the tail from the reference trajectory,
 *   4. copies d_xs into the first state slot of d_xu.
 * The loop stops when traj_offset reaches traj_steps or after 100000 updates.
 *
 * The tracking error is the sum of absolute differences between the first
 * three entries of the end-effector kernel output and the first three entries
 * of d_eePos_goal.
 *
 * @param state_size, control_size, knot_points  problem dimensions
 * @param traj_steps        number of knot points in the reference trajectory
 * @param timestep          knot-point spacing; elapsed time is compared with it
 *                          after scaling simulation_time by 1e-6 (presumably
 *                          seconds vs. microseconds - TODO confirm)
 * @param d_eePos_traj      device reference goals, 6 values per knot point
 * @param d_xu_traj         device reference state/control trajectory
 * @param d_xs              device current state; read every iteration and
 *                          passed to simple_simulate (presumably updated in
 *                          place - TODO confirm)
 * @param start_state_ind, goal_state_ind, test_iter, test_output_prefix
 *                          forwarded to dump_tracking_data when SAVE_DATA is set
 * @param linsys_exit_tol   PCG exit tolerance (used when LINSYS_SOLVE == 1)
 * @return tuple of (linsys_times if TIME_LINSYS == 1 else sqp_iters,
 *                   tracking_errors, tracking error measured after the loop)
 */
template <typename T, typename return_type>
std::tuple<std::vector<toplevel_return_type>, std::vector<linsys_t>, linsys_t> simulateMPC(const uint32_t state_size, const uint32_t control_size, const uint32_t knot_points, const uint32_t traj_steps, 
            float timestep, T *d_eePos_traj, T *d_xu_traj, T *d_xs, uint32_t start_state_ind, uint32_t goal_state_ind, uint32_t test_iter, T linsys_exit_tol,
            std::string test_output_prefix){

    // xu layout: x_0 u_0 x_1 u_1 ... x_{N-1} (no control after the last state)
    const uint32_t traj_len = (state_size+control_size)*knot_points-control_size;

    const T shift_threshold = SHIFT_THRESHOLD;
    const int max_control_updates = 100000;


    double sqp_solve_time_us = 0;               // current sqp solve time
    double simulation_time = 0;                 // current simulation time
    double prev_simulation_time = 0;            // last simulation time
    double time_since_timestep = 0;             // time since last timestep of original trajectory
    bool shifted = false;                       // has xu been shifted
    uint32_t traj_offset = 0;                   // current goal states of original trajectory


    // vars for recording data
    std::vector<std::vector<T>> tracking_path;      // list of traversed traj
    std::vector<int> linsys_iters;
    std::vector<double> linsys_times;
    std::vector<double> sqp_times;
    std::vector<uint32_t> sqp_iters;
    std::vector<bool> sqp_exits;
    std::vector<bool> linsys_exits;
    std::vector<T> tracking_errors;
    std::vector<int> cur_linsys_iters;
    std::vector<bool> cur_linsys_exits;
    std::vector<double> cur_linsys_times;
    std::tuple<std::vector<int>, std::vector<double>, double, uint32_t, bool, std::vector<bool>> sqp_stats;
    uint32_t cur_sqp_iters = 0;
    // Initialised because the live-stats print can read it before the first shift.
    T cur_tracking_error = 0;
    int control_update_step;


    // mpc iterates
    T *d_lambda, *d_eePos_goal, *d_xu, *d_xu_old;
    gpuErrchk(cudaMalloc(&d_lambda, state_size*knot_points*sizeof(T)));
    gpuErrchk(cudaMalloc(&d_xu, traj_len*sizeof(T)));
    gpuErrchk(cudaMalloc(&d_xu_old, traj_len*sizeof(T)));
    gpuErrchk(cudaMalloc(&d_eePos_goal, 6*knot_points*sizeof(T)));
    gpuErrchk(cudaMemset(d_lambda, 0, state_size*knot_points*sizeof(T)));
    gpuErrchk(cudaMemcpy(d_eePos_goal, d_eePos_traj, 6*knot_points*sizeof(T), cudaMemcpyDeviceToDevice));
    gpuErrchk(cudaMemcpy(d_xu_old, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice));
    gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice));


    void *d_dynmem = gato_plant::initializeDynamicsConstMem<T>();


    // temp host memory
    // std::vector instead of a variable-length array (state_size is a runtime value).
    std::vector<T> h_xs(state_size);
    gpuErrchk(cudaMemcpy(h_xs.data(), d_xs, state_size*sizeof(T), cudaMemcpyDeviceToHost));
    tracking_path.push_back(h_xs);
    gpuErrchk(cudaPeekAtLastError());
    T h_eePos[6];
    T h_eePos_goal[6];


    // temp device memory
    T *d_eePos;
    gpuErrchk(cudaMalloc(&d_eePos, 6*sizeof(T)));

#if LINSYS_SOLVE == 1
    pcg_config<T> config;
    config.pcg_block = PCG_NUM_THREADS;
    config.pcg_exit_tol = linsys_exit_tol;
    config.pcg_max_iter = PCG_MAX_ITER;
#endif

    T rho = 1e-3;
    T rho_reset = 1e-3;

#if REMOVE_JITTERS
    // Warm-up: run 100 solves from the reference trajectory, restoring d_xu
    // after each one, then reset rho (and the PCG settings) before tracking.
    #if LINSYS_SOLVE == 1
    config.pcg_exit_tol = 1e-11;
    config.pcg_max_iter = 10000;

    for(int j = 0; j < 100; j++){
        sqpSolvePcg<T>(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, config, rho, 1e-3);
        gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice));
    }
    rho = 1e-3;
    config.pcg_exit_tol = linsys_exit_tol;
    config.pcg_max_iter = PCG_MAX_ITER;
    #else
    for(int j = 0; j < 100; j++){
        sqpSolveQdldl<T>(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, rho, 1e-3);
        gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice));
    }
    rho = 1e-3;
    #endif

#endif // #if REMOVE_JITTERS



    //
    // MPC tracking loop
    //
    for(control_update_step = 0; control_update_step < max_control_updates; control_update_step++){

        // reference trajectory fully consumed
        if (traj_offset == traj_steps){ break; }



#if LIVE_PRINT_PATH
        grid::end_effector_positions_kernel<T><<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel<T> *) d_dynmem, 1);
        gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost));
        for (uint32_t i = 0; i < 6; i++){
            std::cout << h_eePos[i] << (i < 5 ? " " : "\n");
        }
#endif // #if LIVE_PRINT_PATH



#if LINSYS_SOLVE == 1
        sqp_stats = sqpSolvePcg<T>(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, config, rho, rho_reset);
#else 
        sqp_stats = sqpSolveQdldl<T>(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, rho, rho_reset);
#endif

        cur_linsys_iters = std::get<0>(sqp_stats);
        cur_linsys_times = std::get<1>(sqp_stats);
        sqp_solve_time_us = std::get<2>(sqp_stats);
        cur_sqp_iters = std::get<3>(sqp_stats);
        sqp_exits.push_back(std::get<4>(sqp_stats));
        cur_linsys_exits = std::get<5>(sqp_stats);


#if CONST_UPDATE_FREQ
        simulation_time = SIMULATION_PERIOD;
#else
        simulation_time = sqp_solve_time_us;
#endif


        // simulate traj for current solve time, offset by previous solve time
        simple_simulate<T>(state_size, control_size, knot_points, d_xs, d_xu_old, d_dynmem, timestep, prev_simulation_time, simulation_time);

        // old xu = new xu
        gpuErrchk(cudaMemcpy(d_xu_old, d_xu, traj_len*sizeof(T), cudaMemcpyDeviceToDevice));


        time_since_timestep += simulation_time * 1e-6;

        // if shift_threshold% through timestep
        if (!shifted && time_since_timestep > shift_threshold){

            // record tracking error
            grid::end_effector_positions_kernel<T><<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel<T> *) d_dynmem, 1);
            gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost));
            gpuErrchk(cudaMemcpy(h_eePos_goal, d_eePos_goal, 6*sizeof(T), cudaMemcpyDeviceToHost));
            cur_tracking_error = 0.0;
            for(uint32_t i=0; i < 3; i++){
                cur_tracking_error += std::abs(h_eePos[i] - h_eePos_goal[i]);
            }
            // std::cout << cur_tracking_error << std::endl;;
            tracking_errors.push_back(cur_tracking_error);

            traj_offset++;

            // shift xu
            just_shift<T>(state_size, control_size, knot_points, d_xu);             // shift everything over one
            if (traj_offset + knot_points < traj_steps){
                // if within precomputed traj, fill in last state, control with precompute
                // NOTE(review): the source index here is based on traj_offset, while the
                // end-effector goal refill below uses (traj_offset + knot_points - 1);
                // confirm the last knot point is meant to be filled from this position.
                gpuErrchk(cudaMemcpy(&d_xu[traj_len - (state_size + control_size)], &d_xu_traj[(state_size+control_size)*traj_offset - control_size], sizeof(T)*(state_size+control_size), cudaMemcpyDeviceToDevice));     // last state filled from precomputed trajectory
            }
            else{
                // fill in last state with goal position, zero velocity, last control with zero control
                gpuErrchk(cudaMemcpy(&d_xu[traj_len - state_size], &d_xu_traj[(traj_steps-1)*(state_size+control_size)], (state_size/2)*sizeof(T), cudaMemcpyDeviceToDevice));
                gpuErrchk(cudaMemset(&d_xu[traj_len - state_size / 2], 0, (state_size/2) * sizeof(T)));
                gpuErrchk(cudaMemset(&d_xu[traj_len - (state_size+control_size)], 0, control_size * sizeof(T)));
            }

            // shift goal
            just_shift(6, 0, knot_points, d_eePos_goal);
            if (traj_offset + knot_points < traj_steps){
                gpuErrchk(cudaMemcpy(&d_eePos_goal[(knot_points-1)*(6)], &d_eePos_traj[(traj_offset+knot_points-1) * (6)], 6*sizeof(T), cudaMemcpyDeviceToDevice));
            }
            else{
                // past the end of the reference: repeat its final goal entry
                gpuErrchk(cudaMemcpy(&d_eePos_goal[(knot_points-1)*(6)], &d_eePos_traj[(traj_steps-1)*(6)], (6)*sizeof(T), cudaMemcpyDeviceToDevice));
                // gpuErrchk(cudaMemset(&d_eePos_goal[(knot_points-1)*(6) + state_size / 2], 0, (state_size/2) * sizeof(T)));
            }

            // shift lambda
            just_shift(state_size, 0, knot_points, d_lambda);
                // gpuErrchk(cudaMemset(&lambdas[i][state_size*(knot_points-1)], 0, state_size*sizeof(T)));

            shifted = true;
        }

        if (time_since_timestep > timestep){
            // a full timestep has elapsed: allow the next shift and keep the remainder
            // std::cout << "shifted to offset: " << traj_offset + 1 << std::endl;
            shifted = false;
            time_since_timestep = std::fmod(time_since_timestep, timestep);
        }
        // the next solve starts from the current simulated state
        gpuErrchk(cudaMemcpy(d_xu, d_xs, state_size*sizeof(T), cudaMemcpyDeviceToDevice));



        prev_simulation_time = simulation_time;

        gpuErrchk(cudaPeekAtLastError());


        // record data
        linsys_iters.insert(linsys_iters.end(), cur_linsys_iters.begin(), cur_linsys_iters.end());                      // linsys iters
        linsys_times.insert(linsys_times.end(), cur_linsys_times.begin(), cur_linsys_times.end());          // linsys times
        linsys_exits.insert(linsys_exits.end(), cur_linsys_exits.begin(), cur_linsys_exits.end());
        gpuErrchk(cudaMemcpy(h_xs.data(), d_xs, state_size*sizeof(T), cudaMemcpyDeviceToHost));
        tracking_path.push_back(h_xs);                                                                      // next state
        sqp_times.push_back(sqp_solve_time_us);
        sqp_iters.push_back(cur_sqp_iters);


#if LIVE_PRINT_STATS
        if (control_update_step % 1000 == 50){
            for (uint32_t i = 0; i < state_size; i++){
                std::cout << h_xs[i] << (i < state_size-1 ? " " : "\n");
            }
    #if TIME_LINSYS == 1
            std::cout << "linear system solve time:" << std::endl;
            printStats<double>(&linsys_times);
    #endif // #if TIME_LINSYS
            std::cout << "goal offset [" << traj_offset << "]\n";
            std::cout << "sqp iters" << std::endl;
            printStats<uint32_t>(&sqp_iters);
            std::cout << "sqp times" << std::endl;
            printStats<double>(&sqp_times);

            // Guard both averages against empty sample sets (division by zero).
            int totalOnes = std::accumulate(linsys_exits.begin(), linsys_exits.end(), 0);
            double max_iter_pct = linsys_exits.empty() ? 0.0 : (static_cast<double>(totalOnes) / linsys_exits.size());
            std::cout << "linsys exits for max iter: " << max_iter_pct * 100 << "% of the time\n";
            if (max_iter_pct > 0.5) {
               std::cout << "WARNING: PCG exiting for max iter over 50% of the time" << std::endl;
            }

            const float tracking_error_sum = std::accumulate(tracking_errors.begin(), tracking_errors.end(), 0.0f);
            const float avg_tracking_error = (traj_offset > 0) ? (tracking_error_sum / traj_offset) : 0.0f;
            std::cout << "avg tracking error: " << avg_tracking_error << " current error: " << cur_tracking_error << "\n";
            std::cout << std::endl;

        }

#endif


    }
#if SAVE_DATA
    dump_tracking_data(&linsys_iters, &linsys_exits, &linsys_times, &sqp_times, &sqp_iters, &sqp_exits, &tracking_errors, &tracking_path, 
            traj_offset, control_update_step, start_state_ind, goal_state_ind, test_iter, test_output_prefix);
#endif


    // tracking error measured after the loop, returned as the third tuple element
    grid::end_effector_positions_kernel<T><<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel<T> *) d_dynmem, 1);
    gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_eePos_goal, d_eePos_goal, 6*sizeof(T), cudaMemcpyDeviceToHost));
    cur_tracking_error = 0.0;
    for(uint32_t i=0; i < 3; i++){
        cur_tracking_error += std::abs(h_eePos[i] - h_eePos_goal[i]);
    }

    gato_plant::freeDynamicsConstMem<T>(d_dynmem);

    gpuErrchk(cudaFree(d_lambda));
    gpuErrchk(cudaFree(d_xu));
    gpuErrchk(cudaFree(d_eePos_goal));
    gpuErrchk(cudaFree(d_xu_old));

    gpuErrchk(cudaFree(d_eePos));

    #if TIME_LINSYS == 1 
        return std::make_tuple(linsys_times, tracking_errors, cur_tracking_error);
    #else
        return std::make_tuple(sqp_iters, tracking_errors, cur_tracking_error);
    #endif
}
427 | 


--------------------------------------------------------------------------------
/include/pcg/linsys_setup.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cstdint>
  3 | #include "gpuassert.cuh"
  4 | #include "glass.cuh"
  5 | #include "utils/matrix.cuh"
  6 | 
  7 | 
  8 | 
  9 | template <typename T>
 10 | __device__
 11 | void complete_SS_Pinv_blockrow(uint32_t state_size, uint32_t knot_points, T *d_S, T *d_Pinv, T *d_gamma, T *s_temp, unsigned blockrow){
 12 | 
 13 |     const uint32_t states_sq = state_size*state_size;
 14 |     
 15 |     //  STATE OF DEVICE MEM
 16 |     //  S:      -Q0_i in spot 00, phik left off-diagonal, thetak main diagonal, phik_T right off-diagonal
 17 |     //  Phi:    -Q0 in spot 00, theta_invk main diagonal
 18 |     //  gamma:  -Q0_i*q0 spot 0, gammak
 19 | 
 20 | 
 21 |     // GOAL SPACE ALLOCATION IN SHARED MEM
 22 |     // s_temp  = | phi_k_T | phi_k | phi_kp1 | thetaInv_k | thetaInv_kp1 | thetaInv_km1 | PhiInv_R | PhiInv_L | scratch
 23 |     T *s_phi_k = s_temp;
 24 |     T *s_phi_kp1_T = s_phi_k + states_sq;
 25 |     T *s_thetaInv_k = s_phi_kp1_T + states_sq;
 26 |     T *s_thetaInv_km1 = s_thetaInv_k + states_sq;
 27 |     T *s_thetaInv_kp1 = s_thetaInv_km1 + states_sq;
 28 |     T *s_PhiInv_k_R = s_thetaInv_kp1 + states_sq;
 29 |     T *s_PhiInv_k_L = s_PhiInv_k_R + states_sq;
 30 |     T *s_scratch = s_PhiInv_k_L + states_sq;
 31 | 
 32 |     const unsigned lastrow = knot_points - 1;
 33 | 
 34 |     // load phi_kp1_T
 35 |     if(blockrow!=lastrow){
 36 |         load_block_bd<T>(
 37 |             state_size, knot_points,
 38 |             d_S,                // src
 39 |             s_phi_kp1_T,        // dst
 40 |             0,                  // block column (0, 1, or 2)
 41 |             blockrow+1,          // block row
 42 |             true                // transpose
 43 |         );
 44 |     }
 45 |     
 46 | 
 47 |     // load phi_k
 48 |     if(blockrow!=0){
 49 |         load_block_bd<T>(
 50 |             state_size,
 51 |             knot_points,
 52 |             d_S,
 53 |             s_phi_k,
 54 |             0,
 55 |             blockrow
 56 |         );
 57 |     }
 58 |     
 59 | 
 60 | 
 61 |     // load thetaInv_k
 62 |     load_block_bd<T>(
 63 |         state_size, knot_points,
 64 |         d_Pinv,
 65 |         s_thetaInv_k,
 66 |         1,
 67 |         blockrow
 68 |     );
 69 | 
 70 | 
 71 |     // load thetaInv_km1
 72 |     if(blockrow!=0){
 73 |         load_block_bd<T>(
 74 |             state_size, knot_points,
 75 |             d_Pinv,
 76 |             s_thetaInv_km1,
 77 |             1,
 78 |             blockrow-1
 79 |         );
 80 |     }
 81 | 
 82 | 
 83 |     // load thetaInv_kp1
 84 |     if(blockrow!=lastrow){
 85 |         load_block_bd<T>(
 86 |             state_size, knot_points,
 87 |             d_Pinv,
 88 |             s_thetaInv_kp1,
 89 |             1,
 90 |             blockrow+1
 91 |         );
 92 |     }
 93 |     
 94 | 
 95 |     __syncthreads();//----------------------------------------------------------------
 96 | 
 97 |     if(blockrow!=0){
 98 | 
 99 |         // compute left off diag    
100 |         glass::gemm<T>(state_size, state_size, state_size                           , static_cast<T>(1.0), s_thetaInv_k, s_phi_k, s_scratch);
101 |         __syncthreads();//----------------------------------------------------------------
102 |         glass::gemm<T>(state_size, state_size, state_size, static_cast<T>(1.0), s_scratch, s_thetaInv_km1, s_PhiInv_k_L);
103 |         __syncthreads();//----------------------------------------------------------------
104 | 
105 |         // store left diagonal in Phi
106 |         store_block_bd<T>(
107 |             state_size, knot_points,
108 |             s_PhiInv_k_L, 
109 |             d_Pinv,
110 |             0,
111 |             blockrow,
112 |             -1
113 |         );
114 |         __syncthreads();//----------------------------------------------------------------
115 |     }
116 | 
117 | 
118 |     if(blockrow!=lastrow){
119 | 
120 |         // calculate Phi right diag
121 |         glass::gemm<T>(state_size, state_size, state_size                           , static_cast<T>(1.0), s_thetaInv_k, s_phi_kp1_T, s_scratch);
122 |         __syncthreads();//----------------------------------------------------------------
123 |         glass::gemm<T>(state_size, state_size, state_size, static_cast<T>(1.0), s_scratch, s_thetaInv_kp1, s_PhiInv_k_R);
124 |         __syncthreads();//----------------------------------------------------------------
125 | 
126 |         // store Phi right diag
127 |         store_block_bd<T>(
128 |             state_size, knot_points,
129 |             s_PhiInv_k_R, 
130 |             d_Pinv,
131 |             2,
132 |             blockrow,
133 |             -1
134 |         );
135 | 
136 |     }
137 | }
138 | 
139 | template <typename T>
140 | __device__
141 | void form_S_gamma_and_jacobi_Pinv_blockrow(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G, T *d_C, T *d_g, T *d_c, T *d_S, T *d_Pinv, T *d_gamma, T rho, T *s_temp, unsigned blockrow){
142 |     
143 |     //  SPACE ALLOCATION IN SHARED MEM
144 |     //  | phi_k | theta_k | thetaInv_k | gamma_k | block-specific...
145 |     //     s^2      s^2         s^2         s
146 |     T *s_phi_k = s_temp; 	                            	    // phi_k        states^2
147 |     T *s_theta_k = s_phi_k + state_size*state_size; 			            // theta_k      states^2
148 |     T *s_thetaInv_k = s_theta_k + state_size*state_size; 			        // thetaInv_k   states^2
149 |     T *s_gamma_k = s_thetaInv_k + state_size*state_size;                       // gamma_k      states
150 |     T *s_end_main = s_gamma_k + state_size;                               
151 | 
152 |     if(blockrow==0){
153 | 
154 |         //  LEADING BLOCK GOAL SHARED MEMORY STATE
155 |         //  ...gamma_k | . | Q_N_I | q_N | . | Q_0_I | q_0 | scatch
156 |         //              s^2   s^2     s   s^2   s^2     s      ? 
157 |     
158 |         T *s_QN = s_end_main;
159 |         T *s_QN_i = s_QN + state_size * state_size;
160 |         T *s_qN = s_QN_i + state_size * state_size;
161 |         T *s_Q0 = s_qN + state_size;
162 |         T *s_Q0_i = s_Q0 + state_size * state_size;
163 |         T *s_q0 = s_Q0_i + state_size * state_size;
164 |         T *s_end = s_q0 + state_size;
165 | 
166 |         // scratch space
167 |         T *s_R_not_needed = s_end;
168 |         T *s_r_not_needed = s_R_not_needed + control_size * control_size;
169 |         T *s_extra_temp = s_r_not_needed + control_size * control_size;
170 | 
171 |         __syncthreads();//----------------------------------------------------------------
172 | 
173 |         glass::copy<T>(state_size*state_size, d_G, s_Q0);
174 |         glass::copy<T>(state_size*state_size, d_G+(knot_points-1)*(state_size*state_size+control_size*control_size), s_QN);
175 |         glass::copy<T>(state_size, d_g, s_q0);
176 |         glass::copy<T>(state_size, d_g+(knot_points-1)*(state_size+control_size), s_qN);
177 | 
178 |         __syncthreads();//----------------------------------------------------------------
179 | 
180 |         add_identity(s_Q0, state_size, rho);
181 |         add_identity(s_QN, state_size, rho);
182 |         // if(PRINT_THREAD){
183 |         //     printf("Q0\n");
184 |         //     printMat<state_size,state_size>(s_Q0,state_size);
185 |         //     printf("q0\n");
186 |         //     printMat<1,state_size>(s_q0,1);
187 |         //     printf("QN\n");
188 |         //     printMat<state_size,state_size>(s_QN,state_size);
189 |         //     printf("qN\n");
190 |         //     printMat<1,state_size>(s_qN,1);
191 |         //     printf("start error\n");
192 |         //     printMat<1,state_size>(s_integrator_error,1);
193 |         //     printf("\n");
194 |         // }
195 |         __syncthreads();//----------------------------------------------------------------
196 |         
197 |         // SHARED MEMORY STATE
198 |         // | Q_N | . | q_N | Q_0 | . | q_0 | scatch
199 |         
200 | 
201 |         // save -Q_0 in PhiInv spot 00
202 |         store_block_bd<T>(
203 |             state_size,
204 |             knot_points,
205 |             s_Q0,                       // src     
206 |             d_Pinv,                   // dst         
207 |             1,                          // col
208 |             blockrow,                    // blockrow
209 |             -1                          //  multiplier
210 |         );
211 |         __syncthreads();//----------------------------------------------------------------
212 | 
213 | 
214 |         // invert Q_N, Q_0
215 |         loadIdentity<T>( state_size,state_size,s_Q0_i, s_QN_i);
216 |         __syncthreads();//----------------------------------------------------------------
217 |         invertMatrix<T>( state_size,state_size,state_size,s_Q0, s_QN, s_extra_temp);
218 |         
219 |         __syncthreads();//----------------------------------------------------------------
220 | 
221 | 
222 |         // if(PRINT_THREAD){
223 |         //     printf("Q0Inv\n");
224 |         //     printMat<state_size,state_size>(s_Q0_i,state_size);
225 |         //     printf("QNInv\n");
226 |         //     printMat<floatstate_size,state_size>(s_QN_i,state_size);
227 |         //     printf("theta\n");
228 |         //     printMat<floatstate_size,state_size>(s_theta_k,state_size);
229 |         //     printf("thetaInv\n");
230 |         //     printMat<floatstate_size,state_size>(s_thetaInv_k,state_size);
231 |         //     printf("\n");
232 |         // }
233 |         __syncthreads();//----------------------------------------------------------------
234 | 
235 |         // SHARED MEMORY STATE
236 |         // | . | Q_N_i | q_N | . | Q_0_i | q_0 | scatch
237 |         
238 | 
239 |         // compute gamma
240 |         mat_vec_prod<T>( state_size, state_size,
241 |             s_Q0_i,                                    
242 |             s_q0,                                       
243 |             s_gamma_k 
244 |         );
245 |         __syncthreads();//----------------------------------------------------------------
246 |         
247 | 
248 |         // save -Q0_i in spot 00 in S
249 |         store_block_bd<T>( state_size, knot_points,
250 |             s_Q0_i,                         // src             
251 |             d_S,                            // dst              
252 |             1,                              // col   
253 |             blockrow,                        // blockrow         
254 |             -1                              //  multiplier   
255 |         );
256 |         __syncthreads();//----------------------------------------------------------------
257 | 
258 | 
259 |         // compute Q0^{-1}q0
260 |         mat_vec_prod<T>( state_size, state_size,
261 |             s_Q0_i,
262 |             s_q0,
263 |             s_Q0
264 |         );
265 |         __syncthreads();//----------------------------------------------------------------
266 | 
267 | 
268 |         // SHARED MEMORY STATE
269 |         // | . | Q_N_i | q_N | Q0^{-1}q0 | Q_0_i | q_0 | scatch
270 | 
271 | 
272 |         // save -Q0^{-1}q0 in spot 0 in gamma
273 |         for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
274 |             d_gamma[ind] = -s_Q0[ind];
275 |         }
276 |         __syncthreads();//----------------------------------------------------------------
277 | 
278 |     }
279 |     else{                       // blockrow!=LEAD_BLOCK
280 | 
281 | 
282 |         const unsigned C_set_size = state_size*state_size+state_size*control_size;
283 |         const unsigned G_set_size = state_size*state_size+control_size*control_size;
284 | 
285 |         //  NON-LEADING BLOCK GOAL SHARED MEMORY STATE
286 |         //  ...gamma_k | A_k | B_k | . | Q_k_I | . | Q_k+1_I | . | R_k_I | q_k | q_k+1 | r_k | integrator_error | extra_temp
287 |         //               s^2   s*c  s^2   s^2   s^2    s^2    s^2   s^2     s      s      s          s                <s^2?
288 | 
289 |         T *s_Ak = s_end_main; 								
290 |         T *s_Bk = s_Ak +        state_size*state_size;
291 |         T *s_Qk = s_Bk +        state_size*control_size; 	
292 |         T *s_Qk_i = s_Qk +      state_size*state_size;	
293 |         T *s_Qkp1 = s_Qk_i +    state_size*state_size;
294 |         T *s_Qkp1_i = s_Qkp1 +  state_size*state_size;
295 |         T *s_Rk = s_Qkp1_i +    state_size*state_size;
296 |         T *s_Rk_i = s_Rk +      control_size*control_size;
297 |         T *s_qk = s_Rk_i +      control_size*control_size; 	
298 |         T *s_qkp1 = s_qk +      state_size; 			
299 |         T *s_rk = s_qkp1 +      state_size;
300 |         T *s_end = s_rk +       control_size;
301 |         
302 |         // scratch
303 |         T *s_extra_temp = s_end;
304 |         
305 | 
306 |         // if(PRINT_THREAD){
307 |         //     printf("xk\n");
308 |         //     printMat<float1,state_size>(s_xux,1);
309 |         //     printf("uk\n");
310 |         //     printMat<float1,control_size>(&s_xux[state_size],1);
311 |         //     printf("xkp1\n");
312 |         //     printMat<float1,state_size>(&s_xux[state_size+control_size],1);
313 |         //     printf("\n");
314 |         // }
315 | 
316 |         __syncthreads();//----------------------------------------------------------------
317 | 
318 |         glass::copy<T>(state_size*state_size, d_C+      (blockrow-1)*C_set_size, s_Ak);
319 |         glass::copy<T>(state_size*control_size, d_C+      (blockrow-1)*C_set_size+state_size*state_size, s_Bk);
320 |         glass::copy<T>(state_size*state_size, d_G+      (blockrow-1)*G_set_size, s_Qk);
321 |         glass::copy<T>(state_size*state_size, d_G+    (blockrow*G_set_size), s_Qkp1);
322 |         glass::copy<T>(control_size*control_size, d_G+      ((blockrow-1)*G_set_size+state_size*state_size), s_Rk);
323 |         glass::copy<T>(state_size, d_g+      (blockrow-1)*(state_size+control_size), s_qk);
324 |         glass::copy<T>(state_size, d_g+    (blockrow)*(state_size+control_size), s_qkp1);
325 |         glass::copy<T>(control_size, d_g+      ((blockrow-1)*(state_size+control_size)+state_size), s_rk);
326 | 
327 |         __syncthreads();//----------------------------------------------------------------
328 | 
329 |         add_identity(s_Qk, state_size, rho);
330 |         add_identity(s_Qkp1, state_size, rho);
331 |         add_identity(s_Rk, control_size, rho);
332 | 
333 | #if DEBUG_MODE    
334 |         if(blockIdx.x==1 && threadIdx.x==0){
335 |             printf("Ak\n");
336 |             printMat<state_size,state_size>(s_Ak,state_size);
337 |             printf("Bk\n");
338 |             printMat<state_size,control_size>(s_Bk,state_size);
339 |             printf("Qk\n");
340 |             printMat<state_size,state_size>(s_Qk,state_size);
341 |             printf("Rk\n");
342 |             printMat<control_size,control_size>(s_Rk,control_size);
343 |             printf("qk\n");
344 |             printMat<state_size, 1>(s_qk,1);
345 |             printf("rk\n");
346 |             printMat<control_size, 1>(s_rk,1);
347 |             printf("Qkp1\n");
348 |             printMat<state_size,state_size>(s_Qkp1,state_size);
349 |             printf("qkp1\n");
350 |             printMat<state_size, 1>(s_qkp1,1);
351 |             printf("integrator error\n");
352 |         }
353 |         __syncthreads();//----------------------------------------------------------------
354 | #endif /* #if DEBUG_MODE */
355 |         
356 |         // Invert Q, Qp1, R 
357 |         loadIdentity<T>( state_size,state_size,control_size,
358 |             s_Qk_i, 
359 |             s_Qkp1_i, 
360 |             s_Rk_i
361 |         );
362 |         __syncthreads();//----------------------------------------------------------------
363 |         invertMatrix<T>( state_size,state_size,control_size,state_size,
364 |             s_Qk, 
365 |             s_Qkp1, 
366 |             s_Rk, 
367 |             s_extra_temp
368 |         );
369 |         __syncthreads();//----------------------------------------------------------------
370 | 
371 |         // save Qk_i into G (now Ginv) for calculating dz
372 |         glass::copy<T>(state_size*state_size, s_Qk_i, d_G+(blockrow-1)*G_set_size);
373 | 
374 |         // save Rk_i into G (now Ginv) for calculating dz
375 |         glass::copy<T>(control_size*control_size, s_Rk_i, d_G+(blockrow-1)*G_set_size+state_size*state_size);
376 | 
377 |         if(blockrow==knot_points-1){
378 |             // save Qkp1_i into G (now Ginv) for calculating dz
379 |             glass::copy<T>(state_size*state_size, s_Qkp1_i, d_G+(blockrow)*G_set_size);
380 |         }
381 |         __syncthreads();//----------------------------------------------------------------
382 | 
383 | #if DEBUG_MODE
384 |         if(blockrow==1&&threadIdx.x==0){
385 |             printf("Qk\n");
386 |             printMat< state_size,state_size>(s_Qk_i,state_size);
387 |             printf("RkInv\n");
388 |             printMat<control_size,control_size>(s_Rk_i,control_size);
389 |             printf("Qkp1Inv\n");
390 |             printMat< state_size,state_size>(s_Qkp1_i,state_size);
391 |             printf("\n");
392 |         }
393 |         __syncthreads();//----------------------------------------------------------------
394 | #endif /* #if DEBUG_MODE */
395 | 
396 | 
397 |         // Compute -AQ^{-1} in phi
398 |         glass::gemm<T>(state_size, state_size, state_size, static_cast<T>(1.0), s_Ak, s_Qk_i, s_phi_k);
399 |         // for(int i = threadIdx.x; i < state_size*state_size; i++){
400 |         //     s_phi_k[i] *= -1;
401 |         // }
402 | 
403 |         __syncthreads();//----------------------------------------------------------------
404 | 
405 |         // Compute -BR^{-1} in Qkp1
406 |         glass::gemm<T>(state_size, control_size, control_size, static_cast<T>(1.0), s_Bk, s_Rk_i, s_Qkp1);
407 | 
408 |         __syncthreads();//----------------------------------------------------------------
409 | 
410 |         // compute Q_{k+1}^{-1}q_{k+1} - IntegratorError in gamma
411 |         mat_vec_prod<T>( state_size, state_size,
412 |             s_Qkp1_i,
413 |             s_qkp1,
414 |             s_gamma_k
415 |         );
416 |         for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){
417 |             s_gamma_k[i] -= d_c[(blockrow*state_size)+i];
418 |         }
419 |         __syncthreads();//----------------------------------------------------------------
420 | 
421 |         // compute -AQ^{-1}q for gamma         temp storage in extra temp
422 |         mat_vec_prod<T>( state_size, state_size,
423 |             s_phi_k,
424 |             s_qk,
425 |             s_extra_temp
426 |         );
427 |         
428 | 
429 |         __syncthreads();//----------------------------------------------------------------
430 |         
431 |         // compute -BR^{-1}r for gamma           temp storage in extra temp + states
432 |         mat_vec_prod<T>( state_size, control_size,
433 |             s_Qkp1,
434 |             s_rk,
435 |             s_extra_temp + state_size
436 |         );
437 | 
438 |         __syncthreads();//----------------------------------------------------------------
439 |         
440 |         // gamma = yeah...
441 |         for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){
442 |             s_gamma_k[i] += s_extra_temp[state_size + i] + s_extra_temp[i]; 
443 |         }
444 |         __syncthreads();//----------------------------------------------------------------
445 | 
446 |         // compute AQ^{-1}AT   -   Qkp1^{-1} for theta
447 |         glass::gemm<T, true>(
448 |             state_size, 
449 |             state_size, 
450 |             state_size,
451 |             static_cast<T>(1.0), 
452 |             s_phi_k, 
453 |             s_Ak, 
454 |             s_theta_k
455 |         );
456 | 
457 |         __syncthreads();//----------------------------------------------------------------
458 | 
459 | #if DEBUG_MODE
460 |         if(blockrow==1&&threadIdx.x==0){
461 |             printf("this is the A thing\n");
462 |             printMat< state_size, state_size>(s_theta_k, 234);
463 |         }
464 | #endif /* #if DEBUG_MODE */
465 | 
466 |         for(unsigned i = threadIdx.x; i < state_size*state_size; i += blockDim.x){
467 |             s_theta_k[i] += s_Qkp1_i[i];
468 |         }
469 |         
470 |         __syncthreads();//----------------------------------------------------------------
471 | 
472 |         // compute BR^{-1}BT for theta            temp storage in QKp1{-1}
473 |         glass::gemm<T, true>(
474 |             state_size,
475 |             control_size,
476 |             state_size,
477 |             static_cast<T>(1.0),
478 |             s_Qkp1,
479 |             s_Bk,
480 |             s_Qkp1_i
481 |         );
482 | 
483 |         __syncthreads();//----------------------------------------------------------------
484 | 
485 |         for(unsigned i = threadIdx.x; i < state_size*state_size; i += blockDim.x){
486 |             s_theta_k[i] += s_Qkp1_i[i];
487 |         }
488 |         __syncthreads();//----------------------------------------------------------------
489 | 
490 |         // save phi_k into left off-diagonal of S, 
491 |         store_block_bd<T>( state_size, knot_points,
492 |             s_phi_k,                        // src             
493 |             d_S,                            // dst             
494 |             0,                              // col
495 |             blockrow,                        // blockrow    
496 |             -1
497 |         );
498 |         __syncthreads();//----------------------------------------------------------------
499 | 
500 |         // save -s_theta_k main diagonal S
501 |         store_block_bd<T>( state_size, knot_points,
502 |             s_theta_k,                                               
503 |             d_S,                                                 
504 |             1,                                               
505 |             blockrow,
506 |             -1                                             
507 |         );          
508 |         __syncthreads();//----------------------------------------------------------------
509 | 
510 |         // invert theta
511 |         loadIdentity<T>(state_size,s_thetaInv_k);
512 |         __syncthreads();//----------------------------------------------------------------
513 |         invertMatrix<T>(state_size,s_theta_k, s_extra_temp);
514 |         __syncthreads();//----------------------------------------------------------------
515 | 
516 | 
517 |         // save thetaInv_k main diagonal PhiInv
518 |         store_block_bd<T>( state_size, knot_points,
519 |             s_thetaInv_k, 
520 |             d_Pinv,
521 |             1,
522 |             blockrow,
523 |             -1
524 |         );
525 | 
526 |         __syncthreads();//----------------------------------------------------------------
527 | 
528 |         // save gamma_k in gamma
529 |         for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
530 |             unsigned offset = (blockrow)*state_size + ind;
531 |             d_gamma[offset] = s_gamma_k[ind]*-1;
532 |         }
533 | 
534 |         __syncthreads();//----------------------------------------------------------------
535 | 
536 |         //transpose phi_k
537 |         loadIdentity<T>(state_size,s_Ak);
538 |         __syncthreads();//----------------------------------------------------------------
539 |         glass::gemm<T, true>(
540 |             state_size, 
541 |             state_size, 
542 |             state_size,
543 |             static_cast<T>(1.0), 
544 |             s_Ak, 
545 |             s_phi_k, 
546 |             s_Qkp1
547 |         );
548 |         __syncthreads();//----------------------------------------------------------------
549 | 
550 |         // save phi_k_T into right off-diagonal of S,
551 |         store_block_bd<T>( state_size, knot_points,
552 |             s_Qkp1,                        // src             
553 |             d_S,                            // dst             
554 |             2,                              // col
555 |             blockrow-1,                      // blockrow    
556 |             -1
557 |         );
558 | 
559 |         __syncthreads();//----------------------------------------------------------------
560 |     }
561 | 
562 | }
563 | 
564 | 
565 | // Cooperative kernel that assembles the Schur system in two passes over the block rows:
566 | //   pass 1 calls form_S_gamma_and_jacobi_Pinv_blockrow, pass 2 calls complete_SS_Pinv_blockrow.
567 | // The passes are separated by cgrps::this_grid().sync(), so the kernel has to be started with
568 | // cudaLaunchCooperativeKernel. Each block takes rows blockIdx.x, blockIdx.x + gridDim.x, ...
569 | template <typename T>
570 | __global__
571 | void form_S_gamma_Pinv_kernel(
572 |     uint32_t state_size,
573 |     uint32_t control_size,
574 |     uint32_t knot_points,
575 |     T *d_G,
576 |     T *d_C,
577 |     T *d_g,
578 |     T *d_c,
579 |     T *d_S,
580 |     T *d_Pinv, 
581 |     T *d_gamma,
582 |     T rho
583 | ){
584 |     // dynamic shared scratch, handed to both per-row helpers
585 |     extern __shared__ T s_temp[ ];
586 | 
587 |     const unsigned first_row  = blockIdx.x;
588 |     const unsigned row_stride = gridDim.x;
589 | 
590 |     // pass 1
591 |     unsigned row = first_row;
592 |     while(row < knot_points){
593 |         form_S_gamma_and_jacobi_Pinv_blockrow<T>(
594 |             state_size, control_size, knot_points,
595 |             d_G, d_C, d_g, d_c,
596 |             d_S, d_Pinv, d_gamma,
597 |             rho, s_temp, row
598 |         );
599 |         row += row_stride;
600 |     }
601 | 
602 |     // no block starts pass 2 until every block has left pass 1
603 |     cgrps::this_grid().sync();
604 | 
605 |     // pass 2
606 |     row = first_row;
607 |     while(row < knot_points){
608 |         complete_SS_Pinv_blockrow<T>(state_size, knot_points,
609 |                                      d_S, d_Pinv, d_gamma, s_temp, row);
610 |         row += row_stride;
611 |     }
612 | }
613 | 
614 | 
615 | /*******************************************************************************
616 |  *                           Interface Functions                                *
617 |  *******************************************************************************/
618 | 
619 | 
620 | // Host entry point: launches form_S_gamma_Pinv_kernel<T> cooperatively with knot_points blocks of
621 | // SCHUR_THREADS threads. All arguments are forwarded to the kernel unchanged. CUDA errors are routed
622 | // through gpuErrchk; this function itself does not synchronize with the device.
623 | template <typename T>
624 | void form_schur_system(
625 |     uint32_t state_size, 
626 |     uint32_t control_size, 
627 |     uint32_t knot_points,
628 |     T *d_G_dense, T *d_C_dense, 
629 |     T *d_g, T *d_c, 
630 |     T *d_S, T *d_Pinv, T *d_gamma,            
631 |     T rho
632 | ){
633 |     // bytes of dynamic shared memory requested per block (kept as size_t, no narrowing)
634 |     const size_t s_temp_size = sizeof(T)*(8 * state_size*state_size +
635 |                                           7 * state_size + 
636 |                                           state_size * control_size +
637 |                                           3 * control_size + 
638 |                                           2 * control_size * control_size + 
639 |                                           3);
640 | 
641 |     void *kernel = (void *) form_S_gamma_Pinv_kernel<T>;
642 | 
643 |     // A kernel may only use more than 48 KB of dynamic shared memory after opting in explicitly.
644 |     const size_t default_smem_limit = 48 * 1024;
645 |     if(s_temp_size > default_smem_limit){
646 |         gpuErrchk(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(s_temp_size)));
647 |     }
648 | 
649 |     // one pointer per kernel parameter, in declaration order
650 |     void *args[] = {
651 |         (void *) &state_size, (void *) &control_size, (void *) &knot_points,
652 |         (void *) &d_G_dense, (void *) &d_C_dense, (void *) &d_g, (void *) &d_c,
653 |         (void *) &d_S, (void *) &d_Pinv, (void *) &d_gamma,
654 |         (void *) &rho
655 |     };
656 |     gpuErrchk(cudaLaunchCooperativeKernel(kernel, knot_points, SCHUR_THREADS, args, s_temp_size));
657 | }


--------------------------------------------------------------------------------
/include/pcg/sqp.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <vector>
  3 | #include <numeric>
  4 | #include <algorithm>
  5 | #include <cstdint>
  6 | #include <cublas_v2.h>
  7 | #include <math.h>
  8 | #include <cmath>
  9 | #include <random>
 10 | #include <iomanip>
 11 | #include <cuda_runtime.h>
 12 | #include <tuple>
 13 | #include <time.h>
 14 | #include "linsys_setup.cuh"
 15 | #include "common/kkt.cuh"
 16 | #include "common/dz.cuh"
 17 | #include "merit.cuh"
 18 | #include "gpu_pcg.cuh"
 19 | #include "settings.cuh"
 20 | 
 21 | // SQP solve of one tracking problem, with each linear system handed to the cooperative `pcg` kernel.
 22 | // Per iteration: generate_kkt_submatrices -> form_schur_system -> pcg -> compute_dz -> line search over
 23 | // num_alphas candidates -> axpy of the accepted step (alpha = -1/2^i) into d_xu. rho is taken by reference:
 24 | // it grows after a failed line search, shrinks after an accepted one, and is set to rho_reset past rho_max.
 25 | // Returns tuple(pcg_iter_vec, linsys_time_vec, sqp_solve_time, sqp_iter, sqp_time_exit, pcg_exit_vec).
 26 | template <typename T>
 27 | auto sqpSolvePcg(const uint32_t state_size, const uint32_t control_size, const uint32_t knot_points, float timestep, T *d_eePos_traj, T *d_lambda, T *d_xu, void *d_dynMem_const, pcg_config<T>& config, T &rho, T rho_reset){
 28 |     // data storage
 29 |     std::vector<int> pcg_iter_vec;
 30 |     std::vector<bool> pcg_exit_vec;
 31 |     std::vector<double> linsys_time_vec;
 32 |     bool sqp_time_exit = 1;     // for data recording, not a flag
 33 | 
 34 |     // sqp timing
 35 |     struct timespec sqp_solve_start, sqp_solve_end;
 36 |     gpuErrchk(cudaDeviceSynchronize());
 37 |     clock_gettime(CLOCK_MONOTONIC, &sqp_solve_start);
 38 | 
 39 |     const uint32_t states_sq = state_size*state_size;
 40 |     const uint32_t states_p_controls = state_size * control_size;
 41 |     const uint32_t controls_sq = control_size * control_size;
 42 |     const uint32_t states_s_controls = state_size + control_size;
 43 |     const uint32_t KKT_G_DENSE_SIZE_BYTES = static_cast<uint32_t>(((states_sq+controls_sq)*knot_points-controls_sq)*sizeof(T));
 44 |     const uint32_t KKT_C_DENSE_SIZE_BYTES = static_cast<uint32_t>((states_sq+states_p_controls)*(knot_points-1)*sizeof(T));
 45 |     const uint32_t KKT_g_SIZE_BYTES       = static_cast<uint32_t>(((state_size+control_size)*knot_points-control_size)*sizeof(T));
 46 |     const uint32_t KKT_c_SIZE_BYTES       =   static_cast<uint32_t>((state_size*knot_points)*sizeof(T));     
 47 |     const uint32_t DZ_SIZE_BYTES          =   static_cast<uint32_t>((states_s_controls*knot_points-control_size)*sizeof(T));
 48 | 
 49 |     // line search things
 50 |     // NOTE(review): &mu and &timestep (both float) go into kernelArgs below; confirm the kernel takes float, not T.
 51 |     const float mu = 10.0f;
 52 |     const uint32_t num_alphas = 8;
 53 |     T h_merit_news[num_alphas];
 54 |     void *ls_merit_kernel = (void *) ls_gato_compute_merit<T>;
 55 |     const size_t merit_smem_size = get_merit_smem_size<T>(state_size, control_size);
 56 |     T h_merit_initial, min_merit;
 57 |     T alphafinal;
 58 |     T delta_merit_iter = 0;
 59 |     T delta_merit_total = 0;
 60 |     uint32_t line_search_step = 0;
 61 | 
 62 | 
 63 |     // streams n cublas init (one stream per line-search candidate)
 64 |     cudaStream_t streams[num_alphas];
 65 |     for(uint32_t str = 0; str < num_alphas; str++){
 66 |         gpuErrchk(cudaStreamCreate(&streams[str]));
 67 |     }
 68 |     gpuErrchk(cudaPeekAtLastError());
 69 | 
 70 |     cublasHandle_t handle;
 71 |     if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); exit(13); }
 72 |     gpuErrchk(cudaPeekAtLastError());
 73 | 
 74 | 
 75 |     uint32_t sqp_iter = 0;
 76 | 
 77 | 
 78 | 
 79 |     T *d_merit_initial, *d_merit_news, *d_merit_temp,
 80 |           *d_G_dense, *d_C_dense, *d_g, *d_c, *d_Ginv_dense,
 81 |           *d_S, *d_gamma,
 82 |           *d_dz,
 83 |           *d_xs;
 84 | 
 85 |     // rho schedule: drho is the running multiplier applied to rho after every line search
 86 |     T drho = 1.0;
 87 |     T rho_factor = RHO_FACTOR;
 88 |     T rho_max = RHO_MAX;
 89 |     T rho_min = RHO_MIN;
 90 | 
 91 | 
 92 | 
 93 | 
 94 |     gpuErrchk(cudaMalloc(&d_G_dense,  KKT_G_DENSE_SIZE_BYTES));
 95 |     gpuErrchk(cudaMalloc(&d_C_dense,  KKT_C_DENSE_SIZE_BYTES));
 96 |     gpuErrchk(cudaMalloc(&d_g,        KKT_g_SIZE_BYTES));
 97 |     gpuErrchk(cudaMalloc(&d_c,        KKT_c_SIZE_BYTES));
 98 |     d_Ginv_dense = d_G_dense;   // alias, not a copy: compute_dz reads the same buffer the Schur setup was given
 99 | 
100 |     gpuErrchk(cudaMalloc(&d_S, 3*states_sq*knot_points*sizeof(T)));
101 |     gpuErrchk(cudaMalloc(&d_gamma, state_size*knot_points*sizeof(T)));
102 |     gpuErrchk(cudaPeekAtLastError());
103 | 
104 |     
105 |     gpuErrchk(cudaMalloc(&d_dz,       DZ_SIZE_BYTES));
106 |     gpuErrchk(cudaMalloc(&d_xs,       state_size*sizeof(T)));
107 |     gpuErrchk(cudaMemcpy(d_xs, d_xu,  state_size*sizeof(T), cudaMemcpyDeviceToDevice));
108 |     gpuErrchk(cudaMalloc(&d_merit_news, num_alphas*sizeof(T)));
109 |     gpuErrchk(cudaMalloc(&d_merit_temp, num_alphas*knot_points*sizeof(T)));
110 | 
111 |     // merit of the starting iterate; zeroed before it is passed to compute_merit
112 |     gpuErrchk(cudaMalloc(&d_merit_initial, sizeof(T)));
113 |     gpuErrchk(cudaMemset(d_merit_initial, 0, sizeof(T)));
114 |     
115 | 
116 |     // pcg things
117 |     T *d_Pinv;
118 |     gpuErrchk(cudaMalloc(&d_Pinv, 3*states_sq*knot_points*sizeof(T)));
119 |     
120 |     /*   PCG vars   */
121 |     T  *d_r, *d_p, *d_v_temp, *d_eta_new_temp;// *d_r_tilde, *d_upsilon;
122 |     gpuErrchk(cudaMalloc(&d_r, state_size*knot_points*sizeof(T)));
123 |     gpuErrchk(cudaMalloc(&d_p, state_size*knot_points*sizeof(T)));
124 |     gpuErrchk(cudaMalloc(&d_v_temp, knot_points*sizeof(T)));
125 |     gpuErrchk(cudaMalloc(&d_eta_new_temp, knot_points*sizeof(T)));
126 |     
127 |     // NOTE(review): pcg is instantiated with the compile-time STATE_SIZE / KNOT_POINTS, while its buffers and
128 |     // shared memory are sized from the runtime state_size / knot_points; the two are assumed to agree.
129 |     void *pcg_kernel = (void *) pcg<T, STATE_SIZE, KNOT_POINTS>;
130 |     uint32_t pcg_iters;
131 |     uint32_t *d_pcg_iters;
132 |     gpuErrchk(cudaMalloc(&d_pcg_iters, sizeof(uint32_t)));
133 |     bool pcg_exit;
134 |     bool *d_pcg_exit;
135 |     gpuErrchk(cudaMalloc(&d_pcg_exit, sizeof(bool)));
136 |     
137 |     void *pcgKernelArgs[] = {
138 |         (void *)&d_S,
139 |         (void *)&d_Pinv,
140 |         (void *)&d_gamma, 
141 |         (void *)&d_lambda,
142 |         (void *)&d_r,
143 |         (void *)&d_p,
144 |         (void *)&d_v_temp,
145 |         (void *)&d_eta_new_temp,
146 |         (void *)&d_pcg_iters,
147 |         (void *)&d_pcg_exit,
148 |         (void *)&config.pcg_max_iter,
149 |         (void *)&config.pcg_exit_tol
150 |     };
151 |     size_t ppcg_kernel_smem_size = pcgSharedMemSize<T>(state_size, knot_points);
152 | 
153 | 
154 |     gpuErrchk(cudaPeekAtLastError());
155 |     gpuErrchk(cudaDeviceSynchronize());
156 | 
157 | #if TIME_LINSYS
158 |     struct timespec linsys_start, linsys_end;
159 |     double linsys_time;
160 | #endif
161 | #if CONST_UPDATE_FREQ
162 |     struct timespec sqp_cur;
163 |     auto sqpTimecheck = [&]() {
164 |         clock_gettime(CLOCK_MONOTONIC, &sqp_cur);
165 |         return time_delta_us_timespec(sqp_solve_start,sqp_cur) > SQP_MAX_TIME_US;
166 |     };
167 | #else
168 |     auto sqpTimecheck = [&]() { return false; };
169 | #endif
170 | 
171 | 
172 |     ///TODO: atomic race conditions here aren't fixed but don't seem to be problematic
173 |     compute_merit<T><<<knot_points, MERIT_THREADS, merit_smem_size>>>(
174 |         state_size, control_size, knot_points,
175 |         d_xu, 
176 |         d_eePos_traj, 
177 |         static_cast<T>(mu), 
178 |         timestep, 
179 |         d_dynMem_const, 
180 |         d_merit_initial
181 |     );
182 |     gpuErrchk(cudaMemcpyAsync(&h_merit_initial, d_merit_initial, sizeof(T), cudaMemcpyDeviceToHost));
183 |     gpuErrchk(cudaPeekAtLastError());
184 |     // h_merit_initial is first read in the line search, after the blocking cudaMemcpy calls that follow pcg
185 |     //
186 |     //      SQP LOOP
187 |     //
188 |     for(uint32_t sqpiter = 0; sqpiter < SQP_MAX_ITER; sqpiter++){
189 |         
190 |         generate_kkt_submatrices<T><<<knot_points, KKT_THREADS, 2 * get_kkt_smem_size<T>(state_size, control_size)>>>(
191 |             state_size,
192 |             control_size,
193 |             knot_points,
194 |             d_G_dense, 
195 |             d_C_dense, 
196 |             d_g, 
197 |             d_c,
198 |             d_dynMem_const,
199 |             timestep,
200 |             d_eePos_traj,
201 |             d_xs,
202 |             d_xu
203 |         );
204 |         gpuErrchk(cudaPeekAtLastError());
205 |         if (sqpTimecheck()){ break; }
206 | 
207 |         form_schur_system<T>(
208 |             state_size, 
209 |             control_size, 
210 |             knot_points, 
211 |             d_G_dense, 
212 |             d_C_dense, 
213 |             d_g, 
214 |             d_c,
215 |             d_S, 
216 |             d_Pinv, 
217 |             d_gamma,
218 |             rho
219 |         );
220 |         gpuErrchk(cudaPeekAtLastError());
221 |         if (sqpTimecheck()){ break; }
222 |         
223 | 
224 |     #if TIME_LINSYS    
225 |         gpuErrchk(cudaDeviceSynchronize());
226 |         if (sqpTimecheck()){ break; }
227 |         clock_gettime(CLOCK_MONOTONIC,&linsys_start);
228 |     #endif // #if TIME_LINSYS
229 | 
230 |         gpuErrchk(cudaLaunchCooperativeKernel(pcg_kernel, knot_points, PCG_NUM_THREADS, pcgKernelArgs, ppcg_kernel_smem_size));    
231 |         gpuErrchk(cudaMemcpy(&pcg_iters, d_pcg_iters, sizeof(uint32_t), cudaMemcpyDeviceToHost));
232 |         gpuErrchk(cudaMemcpy(&pcg_exit, d_pcg_exit, sizeof(bool), cudaMemcpyDeviceToHost));
233 |         gpuErrchk(cudaPeekAtLastError());
234 | 
235 |     #if TIME_LINSYS
236 |         gpuErrchk(cudaDeviceSynchronize());
237 |         clock_gettime(CLOCK_MONOTONIC,&linsys_end);
238 |         
239 |         linsys_time = time_delta_us_timespec(linsys_start,linsys_end);
240 |         linsys_time_vec.push_back(linsys_time);
241 |     #endif // #if TIME_LINSYS
242 | 
243 |         pcg_iter_vec.push_back(pcg_iters);
244 |         pcg_exit_vec.push_back(pcg_exit);
245 | 
246 |         
247 |         if (sqpTimecheck()){ break; }
248 |         
249 |         // recover dz
250 |         compute_dz(
251 |             state_size,
252 |             control_size,
253 |             knot_points,
254 |             d_Ginv_dense, 
255 |             d_C_dense, 
256 |             d_g, 
257 |             d_lambda, 
258 |             d_dz
259 |         );
260 |         gpuErrchk(cudaPeekAtLastError());
261 |         if (sqpTimecheck()){ break; }
262 |         // NOTE(review): the launch below sizes shared memory with get_merit_smem_size<T>(state_size, knot_points),
263 |         // whereas merit_smem_size above passes control_size as the second argument; confirm which is intended.
264 |         // line search: one cooperative launch per candidate p, each on its own stream
265 |         for(uint32_t p = 0; p < num_alphas; p++){
266 |             void *kernelArgs[] = {
267 |                 (void *)&state_size,
268 |                 (void *)&control_size,
269 |                 (void *)&knot_points,
270 |                 (void *)&d_xs,
271 |                 (void *)&d_xu,
272 |                 (void *)&d_eePos_traj,
273 |                 (void *)&mu, 
274 |                 (void *)&timestep,
275 |                 (void *)&d_dynMem_const,
276 |                 (void *)&d_dz,
277 |                 (void *)&p,
278 |                 (void *)&d_merit_news,
279 |                 (void *)&d_merit_temp
280 |             };
281 |             gpuErrchk(cudaLaunchCooperativeKernel(ls_merit_kernel, knot_points, MERIT_THREADS, kernelArgs, get_merit_smem_size<T>(state_size, knot_points), streams[p]));
282 |         }
283 |         if (sqpTimecheck()){ break; }
284 |         gpuErrchk(cudaPeekAtLastError());
285 |         gpuErrchk(cudaDeviceSynchronize());
286 |         
287 |         
288 |         gpuErrchk(cudaMemcpy(h_merit_news, d_merit_news, num_alphas*sizeof(T), cudaMemcpyDeviceToHost));
289 |         if (sqpTimecheck()){ break; }
290 | 
291 |         // pick the candidate with the lowest merit, provided it beats the current one
292 |         line_search_step = 0;
293 |         min_merit = h_merit_initial;
294 |         for(uint32_t i = 0; i < num_alphas; i++){
295 |         //     std::cout << h_merit_news[i] << (i == 7 ? "\n" : " ");
296 |             ///TODO: reduction ratio
297 |             if(h_merit_news[i] < min_merit){
298 |                 min_merit = h_merit_news[i];
299 |                 line_search_step = i;
300 |             }
301 |         }
302 | 
303 | 
304 |         if(min_merit == h_merit_initial){
305 |             // line search failure
306 |             drho = max(drho*rho_factor, rho_factor);
307 |             rho = max(rho*drho, rho_min);
308 |             sqp_iter++;
309 |             if(rho > rho_max){
310 |                 sqp_time_exit = 0;
311 |                 rho = rho_reset;
312 |                 break; 
313 |             }
314 |             continue;
315 |         }
316 |         // std::cout << "line search accepted\n";
317 |         alphafinal = -1.0 / (1 << line_search_step);        // alpha sign
318 | 
319 |         drho = min(drho/rho_factor, 1/rho_factor);
320 |         rho = max(rho*drho, rho_min);
321 |         
322 |         // d_xu += alphafinal * d_dz
323 | #if USE_DOUBLES
324 |         const cublasStatus_t axpy_status = cublasDaxpy(
325 |             handle, 
326 |             DZ_SIZE_BYTES / sizeof(T),
327 |             &alphafinal,
328 |             d_dz, 1,
329 |             d_xu, 1
330 |         );
331 | #else
332 |         const cublasStatus_t axpy_status = cublasSaxpy(
333 |             handle, 
334 |             DZ_SIZE_BYTES / sizeof(T),
335 |             &alphafinal,
336 |             d_dz, 1,
337 |             d_xu, 1
338 |         );
339 | #endif
340 |         if (axpy_status != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS axpy failed\n"); exit(13); }
341 |         gpuErrchk(cudaPeekAtLastError());
342 |         // if success increment after update
343 |         sqp_iter++;
344 | 
345 |         if (sqpTimecheck()){ break; }
346 | 
347 | 
348 |         delta_merit_iter = h_merit_initial - min_merit;
349 |         delta_merit_total += delta_merit_iter;
350 |         
351 |         // the accepted merit becomes the baseline for the next iteration
352 |         h_merit_initial = min_merit;
353 |     
354 |     }
355 |     
356 |     gpuErrchk(cudaPeekAtLastError());
357 |     gpuErrchk(cudaDeviceSynchronize());
358 |     clock_gettime(CLOCK_MONOTONIC, &sqp_solve_end);
359 | 
360 |     cublasDestroy(handle);
361 | 
362 |     for(uint32_t st=0; st < num_alphas; st++){
363 |         gpuErrchk(cudaStreamDestroy(streams[st]));
364 |     }
365 | 
366 | 
367 | 
368 |     // d_Ginv_dense aliases d_G_dense and must not be freed separately
369 |     gpuErrchk(cudaFree(d_merit_initial));
370 |     gpuErrchk(cudaFree(d_merit_news));
371 |     gpuErrchk(cudaFree(d_merit_temp));
372 |     gpuErrchk(cudaFree(d_G_dense));
373 |     gpuErrchk(cudaFree(d_C_dense));
374 |     gpuErrchk(cudaFree(d_g));
375 |     gpuErrchk(cudaFree(d_c));
376 |     gpuErrchk(cudaFree(d_S));
377 |     gpuErrchk(cudaFree(d_gamma));
378 |     gpuErrchk(cudaFree(d_dz));
379 |     gpuErrchk(cudaFree(d_xs));
380 |     gpuErrchk(cudaFree(d_pcg_iters));
381 |     gpuErrchk(cudaFree(d_pcg_exit));
382 |     gpuErrchk(cudaFree(d_Pinv));
383 |     gpuErrchk(cudaFree(d_r));
384 |     gpuErrchk(cudaFree(d_p));
385 |     gpuErrchk(cudaFree(d_v_temp));
386 |     gpuErrchk(cudaFree(d_eta_new_temp));
387 | 
388 | 
389 | 
390 |     double sqp_solve_time = time_delta_us_timespec(sqp_solve_start, sqp_solve_end);
391 | 
392 |     return std::make_tuple(pcg_iter_vec, linsys_time_vec, sqp_solve_time, sqp_iter, sqp_time_exit, pcg_exit_vec);
393 | }
394 | 


--------------------------------------------------------------------------------
/include/qdldl/linsys_setup.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cstdint>
  3 | #include "gpuassert.cuh"
  4 | #include "glass.cuh"
  5 | #include "dynamics/rbd_plant.cuh"
  6 | #include "merit.cuh"
  7 | #include "utils/matrix.cuh"
  8 | #include "utils/csr.cuh"
  9 | #include "integrator.cuh"
 10 | #include "qdldl.h"
 11 | 
 12 | template <typename T>
 13 | __global__
 14 | void form_schur_qdl_kernel(uint32_t state_size,
 15 |                             uint32_t control_size,
 16 |                             uint32_t knot_points,
 17 |                             T *d_G,
 18 |                             T *d_C,
 19 |                             T *d_g,
 20 |                             T *d_c,
 21 |                             QDLDL_float *d_val,
 22 |                             T *d_gamma,
 23 |                             T rho)
 24 | {
 25 | 
 26 | 
 27 |     
 28 |     extern __shared__ T s_temp[ ];    
 29 |     const uint32_t states_sq = state_size*state_size;
 30 |     const uint32_t states_p_controls = state_size * control_size;
 31 |     const uint32_t controls_sq = control_size * control_size;
 32 |     const uint32_t states_s_controls = state_size + control_size;
 33 | 
 34 | 
 35 |     for(unsigned blockrow=blockIdx.x; blockrow<knot_points; blockrow+=gridDim.x){
 36 | 
 37 |         //  SPACE ALLOCATION IN SHARED MEM
 38 |         //  | phi_k | theta_k | thetaInv_k | gamma_k | block-specific...
 39 |         //     s^2      s^2         s^2         s
 40 |         T *s_phi_k = s_temp; 	                            	    // phi_k        states^2
 41 |         T *s_theta_k = s_phi_k + states_sq; 			            // theta_k      states^2
 42 |         T *s_thetaInv_k = s_theta_k + states_sq; 			        // thetaInv_k   states^2
 43 |         T *s_gamma_k = s_thetaInv_k + states_sq;                       // gamma_k      states
 44 |         T *s_end_main = s_gamma_k + state_size;                               
 45 | 
 46 |         if(blockrow==0){
 47 | 
 48 |             //  LEADING BLOCK GOAL SHARED MEMORY STATE
 49 |             //  ...gamma_k | . | Q_N_I | q_N | . | Q_0_I | q_0 | scatch
 50 |             //              s^2   s^2     s   s^2   s^2     s      ? 
 51 |         
 52 |             T *s_QN = s_end_main;
 53 |             T *s_QN_i = s_QN + state_size * state_size;
 54 |             T *s_qN = s_QN_i + state_size * state_size;
 55 |             T *s_Q0 = s_qN + state_size;
 56 |             T *s_Q0_i = s_Q0 + state_size * state_size;
 57 |             T *s_q0 = s_Q0_i + state_size * state_size;
 58 |             T *s_end = s_q0 + state_size;
 59 | 
 60 |             // scratch space
 61 |             T *s_R_not_needed = s_end;
 62 |             T *s_r_not_needed = s_R_not_needed + control_size * control_size;
 63 |             T *s_extra_temp = s_r_not_needed + control_size * control_size;
 64 | 
 65 |             __syncthreads();//----------------------------------------------------------------
 66 | 
 67 |             gato_memcpy(s_Q0, d_G, states_sq);
 68 |             gato_memcpy(s_QN, d_G+(knot_points-1)*(states_sq+controls_sq), states_sq);
 69 |             gato_memcpy(s_q0, d_g, state_size);
 70 |             gato_memcpy(s_qN, d_g+(knot_points-1)*(state_size+control_size), state_size);
 71 | 
 72 |             __syncthreads();//----------------------------------------------------------------
 73 | 
 74 |             add_identity<T>(s_Q0, state_size, rho);
 75 |             add_identity<T>(s_QN, state_size, rho);
 76 |             
 77 |             __syncthreads();//----------------------------------------------------------------
 78 |             
 79 |             // SHARED MEMORY STATE
 80 |             // | Q_N | . | q_N | Q_0 | . | q_0 | scatch
 81 |             
 82 |             __syncthreads();//----------------------------------------------------------------
 83 | 
 84 | 
 85 |             // invert Q_N, Q_0
 86 |             loadIdentity<T>( state_size,state_size,s_Q0_i, s_QN_i);
 87 |             __syncthreads();//----------------------------------------------------------------
 88 |             invertMatrix<T>( state_size,state_size,state_size,s_Q0, s_QN, s_extra_temp);
 89 |             
 90 |             __syncthreads();//----------------------------------------------------------------
 91 | 
 92 | 
 93 |             // SHARED MEMORY STATE
 94 |             // | . | Q_N_i | q_N | . | Q_0_i | q_0 | scatch
 95 |             
 96 | 
 97 |             // compute gamma
 98 |             mat_vec_prod<T>( state_size, state_size,
 99 |                 s_Q0_i,                                    
100 |                 s_q0,                                       
101 |                 s_gamma_k 
102 |             );
103 |             __syncthreads();//----------------------------------------------------------------
104 |             
105 |             // save -Q0_i in spot 00 in S
106 |             store_block_csr_lowertri<T>(state_size, knot_points, s_Q0_i, d_val, 1, blockrow, -1);
107 | 
108 |             __syncthreads();//----------------------------------------------------------------
109 | 
110 | 
111 |             // compute Q0^{-1}q0
112 |             mat_vec_prod<T>( state_size, state_size,
113 |                 s_Q0_i,
114 |                 s_q0,
115 |                 s_Q0
116 |             );
117 |             __syncthreads();//----------------------------------------------------------------
118 | 
119 | 
120 |             // SHARED MEMORY STATE
121 |             // | . | Q_N_i | q_N | Q0^{-1}q0 | Q_0_i | q_0 | scratch
122 | 
123 | 
124 |             // save -Q0^{-1}q0 in spot 0 in gamma
125 |             for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
126 |                 d_gamma[ind] = -s_Q0[ind];
127 |             }
128 |             __syncthreads();//----------------------------------------------------------------
129 | 
130 |         }
131 |         else{                       // blockrow!=LEAD_BLOCK
132 | 
133 | 
134 |             const unsigned C_set_size = states_sq+states_p_controls;
135 |             const unsigned G_set_size = states_sq+controls_sq;
136 | 
137 |             //  NON-LEADING BLOCK GOAL SHARED MEMORY STATE
138 |             //  ...gamma_k | A_k | B_k | . | Q_k_I | . | Q_k+1_I | . | R_k_I | q_k | q_k+1 | r_k | integrator_error | extra_temp
139 |             //               s^2   s*c  s^2   s^2   s^2    s^2    s^2   s^2     s      s      s          s                <s^2?
140 | 
141 |             T *s_Ak = s_end_main; 								
142 |             T *s_Bk = s_Ak +        states_sq;
143 |             T *s_Qk = s_Bk +        states_p_controls; 	
144 |             T *s_Qk_i = s_Qk +      states_sq;	
145 |             T *s_Qkp1 = s_Qk_i +    states_sq;
146 |             T *s_Qkp1_i = s_Qkp1 +  states_sq;
147 |             T *s_Rk = s_Qkp1_i +    states_sq;
148 |             T *s_Rk_i = s_Rk +      controls_sq;
149 |             T *s_qk = s_Rk_i +      controls_sq; 	
150 |             T *s_qkp1 = s_qk +      state_size; 			
151 |             T *s_rk = s_qkp1 +      state_size;
152 |             T *s_end = s_rk +       control_size;
153 |             
154 |             // scratch
155 |             T *s_extra_temp = s_end;
156 |             
157 | 
158 |             __syncthreads();//----------------------------------------------------------------
159 | 
160 |             gato_memcpy(s_Ak,   d_C+      (blockrow-1)*C_set_size,                        states_sq);
161 |             gato_memcpy(s_Bk,   d_C+      (blockrow-1)*C_set_size+states_sq,              states_p_controls);
162 |             gato_memcpy(s_Qk,   d_G+      (blockrow-1)*G_set_size,                        states_sq);
163 |             gato_memcpy(s_Qkp1, d_G+    (blockrow*G_set_size),                          states_sq);
164 |             gato_memcpy(s_Rk,   d_G+      ((blockrow-1)*G_set_size+states_sq),            controls_sq);
165 |             gato_memcpy(s_qk,   d_g+      (blockrow-1)*(states_s_controls),               state_size);
166 |             gato_memcpy(s_qkp1, d_g+    (blockrow)*(states_s_controls),                 state_size);
167 |             gato_memcpy(s_rk,   d_g+      ((blockrow-1)*(states_s_controls)+state_size),  control_size);
168 | 
169 |             __syncthreads();//----------------------------------------------------------------
170 | 
171 |             add_identity<T>(s_Qk, state_size, rho);
172 |             add_identity<T>(s_Qkp1, state_size, rho);
173 |             add_identity<T>(s_Rk, control_size, rho);
174 |             
175 |             // Invert Q, Qp1, R 
176 |             loadIdentity<T>( state_size,state_size,control_size,
177 |                 s_Qk_i, 
178 |                 s_Qkp1_i, 
179 |                 s_Rk_i
180 |             );
181 |             __syncthreads();//----------------------------------------------------------------
182 |             invertMatrix<T>( state_size,state_size,control_size,state_size,
183 |                 s_Qk, 
184 |                 s_Qkp1, 
185 |                 s_Rk, 
186 |                 s_extra_temp
187 |             );
188 |             __syncthreads();//----------------------------------------------------------------
189 | 
190 |             // save Qk_i into G (now Ginv) for calculating dz
191 |             gato_memcpy(
192 |                 d_G+(blockrow-1)*G_set_size,
193 |                 s_Qk_i,
194 |                 states_sq
195 |             );
196 | 
197 |             // save Rk_i into G (now Ginv) for calculating dz
198 |             gato_memcpy( 
199 |                 d_G+(blockrow-1)*G_set_size+states_sq,
200 |                 s_Rk_i,
201 |                 controls_sq
202 |             );
203 | 
204 |             if(blockrow==knot_points-1){
205 |                 // save Qkp1_i into G (now Ginv) for calculating dz
206 |                 gato_memcpy(
207 |                     d_G+(blockrow)*G_set_size,
208 |                     s_Qkp1_i,
209 |                     states_sq
210 |                 );
211 |             }
212 |             __syncthreads();//----------------------------------------------------------------
213 | 
214 |             // Compute -AQ^{-1} in phi
215 |             glass::gemm<T>(
216 |                 state_size, 
217 |                 state_size, 
218 |                 state_size,
219 |                 static_cast<T>(1.0),
220 |                 s_Ak, 
221 |                 s_Qk_i, 
222 |                 s_phi_k
223 |             );
224 | 
225 |             __syncthreads();//----------------------------------------------------------------
226 | 
227 |             // Compute -BR^{-1} in Qkp1
228 |             glass::gemm<T>(
229 |                 state_size, 
230 |                 control_size, 
231 |                 control_size,
232 |                 static_cast<T>(1.0),
233 |                 s_Bk, 
234 |                 s_Rk_i, 
235 |                 s_Qkp1
236 |             );
237 | 
238 |             __syncthreads();//----------------------------------------------------------------
239 | 
240 |             // compute Q_{k+1}^{-1}q_{k+1} - IntegratorError in gamma
241 |             mat_vec_prod<T>( state_size, state_size,
242 |                 s_Qkp1_i,
243 |                 s_qkp1,
244 |                 s_gamma_k
245 |             );
246 |             for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){
247 |                 s_gamma_k[i] -= d_c[(blockrow*state_size)+i];
248 |             }
249 |             __syncthreads();//----------------------------------------------------------------
250 | 
251 |             // compute -AQ^{-1}q for gamma         temp storage in extra temp
252 |             mat_vec_prod<T>( state_size, state_size,
253 |                 s_phi_k,
254 |                 s_qk,
255 |                 s_extra_temp
256 |             );
257 |             
258 | 
259 |             __syncthreads();//----------------------------------------------------------------
260 |             
261 |             // compute -BR^{-1}r for gamma           temp storage in extra temp + states
262 |             mat_vec_prod<T>( state_size, control_size,
263 |                 s_Qkp1,
264 |                 s_rk,
265 |                 s_extra_temp + state_size
266 |             );
267 | 
268 |             __syncthreads();//----------------------------------------------------------------
269 |             
270 |             // gamma += (phi_k * q_k) + (Qkp1 * r_k), the two products staged in s_extra_temp above
271 |             for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){
272 |                 s_gamma_k[i] += s_extra_temp[state_size + i] + s_extra_temp[i]; 
273 |             }
274 |             __syncthreads();//----------------------------------------------------------------
275 | 
276 |             // compute AQ^{-1}AT   -   Qkp1^{-1} for theta
277 |             glass::gemm<T, true>(
278 |                 state_size, 
279 |                 state_size, 
280 |                 state_size,
281 |                 static_cast<T>(1.0),
282 |                 s_phi_k, 
283 |                 s_Ak, 
284 |                 s_theta_k
285 |             );
286 | 
287 |             __syncthreads();//----------------------------------------------------------------
288 | 
289 | 
290 |             for(unsigned i = threadIdx.x; i < states_sq; i += blockDim.x){
291 |                 s_theta_k[i] += s_Qkp1_i[i];
292 |             }
293 |             
294 |             __syncthreads();//----------------------------------------------------------------
295 | 
296 |             // compute BR^{-1}BT for theta            temp storage in QKp1{-1}
297 |             glass::gemm<T, true>(
298 |                 state_size, 
299 |                 control_size,
300 |                 state_size, 
301 |                 static_cast<T>(1.0),
302 |                 s_Qkp1, 
303 |                 s_Bk, 
304 |                 s_Qkp1_i
305 |             );
306 | 
307 |             __syncthreads();//----------------------------------------------------------------
308 | 
309 |             for(unsigned i = threadIdx.x; i < states_sq; i += blockDim.x){
310 |                 s_theta_k[i] += s_Qkp1_i[i];
311 |             }
312 |             __syncthreads();//----------------------------------------------------------------
313 | 
314 |             // // save phi_k into left off-diagonal of S, 
315 |             store_block_csr_lowertri<T>(state_size, knot_points, s_phi_k, d_val, 0, blockrow, -1);
316 |             
317 |             __syncthreads();//----------------------------------------------------------------
318 | 
319 | 
320 |             // save -s_theta_k main diagonal S
321 |             store_block_csr_lowertri<T>(state_size, knot_points, s_theta_k, d_val, 1, blockrow, -1);
322 |             
323 |             __syncthreads();//----------------------------------------------------------------
324 | 
325 |             // save gamma_k in gamma
326 |             for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){
327 |                 unsigned offset = (blockrow)*state_size + ind;
328 |                 d_gamma[offset] = s_gamma_k[ind]*-1;
329 |             }
330 | 
331 |             __syncthreads();//----------------------------------------------------------------
332 | 
333 |         }
334 |         
335 |     }
336 | }
337 | 
338 | // Host-side launcher for form_schur_qdl_kernel: one thread block per knot point,
339 | // SCHUR_THREADS threads per block, dynamic shared memory sized from the problem dimensions.
340 | template <typename T>
341 | void form_schur_system_qdldl(uint32_t state_size, uint32_t control_size, uint32_t knot_points,
342 |                 T *d_G_dense, T *d_C_dense, T *d_g, T *d_c, 
343 |                 QDLDL_float *d_val, T *d_gamma,
344 |                 T rho)
345 | {
346 |     // number of T elements in the kernel's dynamic shared-memory scratch
347 |     const uint32_t states_sq = state_size * state_size;
348 |     const uint32_t controls_sq = control_size * control_size;
349 |     const uint32_t smem_elems = 8 * states_sq + 7 * state_size + state_size * control_size + 3 * control_size + 2 * controls_sq + 3;
350 |     const uint32_t smem_bytes = sizeof(T) * smem_elems;
351 |     form_schur_qdl_kernel<T><<<knot_points, SCHUR_THREADS, smem_bytes>>>(state_size, control_size, knot_points, d_G_dense, d_C_dense, d_g, d_c, d_val, d_gamma, rho);
352 | }


--------------------------------------------------------------------------------
/include/qdldl/sqp.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <vector>
  3 | #include <numeric>
  4 | #include <algorithm>
  5 | #include <cstdint>
  6 | #include <cublas_v2.h>
  7 | #include <math.h>
  8 | #include <cmath>
  9 | #include <random>
 10 | #include <iomanip>
 11 | #include <cuda_runtime.h>
 12 | #include <tuple>
 13 | #include <time.h>
 14 | #include "qdldl.h"
 15 | #include "qdldl/linsys_setup.cuh"
 16 | #include "merit.cuh"
 17 | #include "settings.cuh"
 18 | #include "kkt.cuh"
 19 | #include "dz.cuh"
 20 | 
 21 | 
 22 | // Solves the sparse system A x = b on the host with QDLDL (factorise, then solve in place).
 23 | //   An                   : dimension of A
 24 | //   h_col_ptr, h_row_ind : pointer / index arrays of A, handed to QDLDL_factor as Ap / Ai
 25 | //   Ax                   : nonzero values of A
 26 | //   b                    : right-hand side, An entries, only read
 27 | //   h_lambda             : output; receives a copy of b and is then overwritten by QDLDL_solve
 28 | //   Lp, Li, Lx, D, Dinv  : storage for the factors, passed to QDLDL_factor
 29 | //   Lnz, etree           : forwarded to QDLDL_factor
 30 | //   bwork, iwork, fwork  : factorisation workspaces
 31 | // NOTE(review): the value returned by QDLDL_factor is not inspected here.
 32 | __host__
 33 | void qdldl_solve_schur(const QDLDL_int An,
 34 | 					   QDLDL_int *h_col_ptr, QDLDL_int *h_row_ind, QDLDL_float *Ax, QDLDL_float *b, 
 35 | 					   QDLDL_float *h_lambda,
 36 | 					   QDLDL_int *Lp, QDLDL_int *Li, QDLDL_float *Lx, QDLDL_float *D, QDLDL_float *Dinv, QDLDL_int *Lnz, QDLDL_int *etree, QDLDL_bool *bwork, QDLDL_int *iwork, QDLDL_float *fwork){
 37 | 
 38 |     // factorise A; the factors land in Lp/Li/Lx and D/Dinv
 39 |     QDLDL_factor(An, h_col_ptr, h_row_ind, Ax, Lp, Li, Lx, D, Dinv, Lnz, etree, bwork, iwork, fwork);
 40 | 
 41 |     // the solve runs in place, so seed the output vector with the right-hand side
 42 |     QDLDL_int idx = 0;
 43 |     while(idx < An){
 44 |         h_lambda[idx] = b[idx];
 45 |         idx++;
 46 |     }
 47 | 
 48 |     QDLDL_solve(An, Lp, Li, Lx, Dinv, h_lambda);
 49 | }
 50 | 
 51 | 
 52 | // Runs the SQP loop for one tracking problem. Each Schur-complement system is assembled on the
 53 | // GPU, factorised and solved on the host with QDLDL, and the step is chosen by a parallel line search.
 54 | //   state_size, control_size, knot_points : problem dimensions
 55 | //   timestep, d_eePos_traj, d_dynMem_const : forwarded unchanged to the KKT and merit kernels
 56 | //   d_lambda       : device buffer of state_size*knot_points T entries, overwritten by every linear solve
 57 | //   d_xu           : device iterate, updated in place whenever a line-search step is accepted
 58 | //   rho, rho_reset : regularisation; rho is adapted in place and set to rho_reset once it exceeds RHO_MAX
 59 | // Returns the tuple (linsys_iter_vec, linsys_time_vec, sqp_solve_time in us, sqp_iter, sqp_time_exit,
 60 | // linsys_exit_vec). linsys_iter_vec and linsys_exit_vec are never filled here; linsys_time_vec is
 61 | // filled only when TIME_LINSYS == 1.
 62 | template <typename T>
 63 | auto sqpSolveQdldl(uint32_t state_size, uint32_t control_size, uint32_t knot_points, float timestep, T *d_eePos_traj, T *d_lambda, T *d_xu, void *d_dynMem_const, T &rho, T rho_reset){
 64 | 
 65 |     // data storage
 66 |     std::vector<int> linsys_iter_vec;
 67 |     std::vector<bool> linsys_exit_vec;
 68 |     std::vector<double> linsys_time_vec;
 69 |     bool sqp_time_exit = 1;     // for data recording, not a flag
 70 | 
 71 |     // sqp timing
 72 |     struct timespec sqp_solve_start, sqp_solve_end;
 73 |     gpuErrchk(cudaDeviceSynchronize());
 74 |     clock_gettime(CLOCK_MONOTONIC, &sqp_solve_start);
 75 | 
 76 |     // element counts and byte sizes of the dense KKT pieces
 77 |     const uint32_t states_sq = state_size*state_size;
 78 |     const uint32_t states_p_controls = state_size * control_size;
 79 |     const uint32_t controls_sq = control_size * control_size;
 80 |     const uint32_t states_s_controls = state_size + control_size;
 81 |     const uint32_t KKT_G_DENSE_SIZE_BYTES = static_cast<uint32_t>(((states_sq+controls_sq)*knot_points-controls_sq)*sizeof(T));
 82 |     const uint32_t KKT_C_DENSE_SIZE_BYTES = static_cast<uint32_t>((states_sq+states_p_controls)*(knot_points-1)*sizeof(T));
 83 |     const uint32_t KKT_g_SIZE_BYTES       = static_cast<uint32_t>(((state_size+control_size)*knot_points-control_size)*sizeof(T));
 84 |     const uint32_t KKT_c_SIZE_BYTES       =   static_cast<uint32_t>((state_size*knot_points)*sizeof(T));
 85 |     const uint32_t DZ_SIZE_BYTES          =   static_cast<uint32_t>((states_s_controls*knot_points-control_size)*sizeof(T));
 86 | 
 87 |     // line search things
 88 |     const float mu = 10.0f;
 89 |     const uint32_t num_alphas = 8;
 90 |     T h_merit_news[num_alphas];
 91 |     void *ls_merit_kernel = (void *) ls_gato_compute_merit<T>;
 92 |     const size_t merit_smem_size = get_merit_smem_size<T>(state_size, control_size);
 93 |     T h_merit_initial, min_merit;
 94 |     T alphafinal;
 95 |     T delta_merit_iter = 0;
 96 |     T delta_merit_total = 0;
 97 |     uint32_t line_search_step = 0;
 98 | 
 99 |     // streams n cublas init
100 |     cudaStream_t streams[num_alphas];
101 |     for(uint32_t str = 0; str < num_alphas; str++){
102 |         gpuErrchk(cudaStreamCreate(&streams[str]));
103 |     }
104 |     gpuErrchk(cudaPeekAtLastError());
105 | 
106 |     cublasHandle_t handle;
107 |     if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); exit(13); }
108 |     gpuErrchk(cudaPeekAtLastError());
109 | 
110 |     uint32_t sqp_iter = 0;
111 | 
112 |     // device buffers (d_Ginv_dense aliases d_G_dense, see below)
113 |     T *d_merit_initial, *d_merit_news, *d_merit_temp,
114 |           *d_G_dense, *d_C_dense, *d_g, *d_c, *d_Ginv_dense,
115 |           *d_gamma,
116 |           *d_dz,
117 |           *d_xs;
118 | 
119 |     // regularisation schedule
120 |     T drho = 1.0;
121 |     T rho_factor = RHO_FACTOR;
122 |     T rho_max = RHO_MAX;
123 |     T rho_min = RHO_MIN;
124 | 
125 |     gpuErrchk(cudaMalloc(&d_G_dense,  KKT_G_DENSE_SIZE_BYTES));
126 |     gpuErrchk(cudaMalloc(&d_C_dense,  KKT_C_DENSE_SIZE_BYTES));
127 |     gpuErrchk(cudaMalloc(&d_g,        KKT_g_SIZE_BYTES));
128 |     gpuErrchk(cudaMalloc(&d_c,        KKT_c_SIZE_BYTES));
129 |     // the Schur kernel stores the inverted blocks back into d_G_dense, hence the alias
130 |     d_Ginv_dense = d_G_dense;
131 | 
132 |     gpuErrchk(cudaMalloc(&d_gamma, state_size*knot_points*sizeof(T)));
133 |     gpuErrchk(cudaPeekAtLastError());
134 | 
135 |     gpuErrchk(cudaMalloc(&d_dz,       DZ_SIZE_BYTES));
136 |     gpuErrchk(cudaMalloc(&d_xs,       state_size*sizeof(T)));
137 |     gpuErrchk(cudaMemcpy(d_xs, d_xu,  state_size*sizeof(T), cudaMemcpyDeviceToDevice));
138 |     gpuErrchk(cudaMalloc(&d_merit_news, num_alphas*sizeof(T)));
139 |     gpuErrchk(cudaMalloc(&d_merit_temp, num_alphas*knot_points*sizeof(T)));
140 | 
141 |     gpuErrchk(cudaMalloc(&d_merit_initial, sizeof(T)));
142 |     gpuErrchk(cudaMemset(d_merit_initial, 0, sizeof(T)));
143 | 
144 |     // dimension of the Schur system and number of stored nonzeros (one triangle of a block tridiagonal matrix)
145 |     const QDLDL_int An = state_size*knot_points;
146 |     const int nnz = (knot_points-1)*states_sq + knot_points*(((state_size+1)*state_size)/2);
147 | 
148 |     // host copies of the sparse system; heap-backed so large problems cannot overflow the stack
149 |     std::vector<QDLDL_float> h_lambda(An);
150 |     std::vector<QDLDL_float> h_gamma(An);
151 |     std::vector<QDLDL_int> h_col_ptr(An+1);
152 |     std::vector<QDLDL_int> h_row_ind(nnz);
153 |     std::vector<QDLDL_float> h_val(nnz);
154 |     // staging buffer in T: d_gamma / d_lambda hold T while QDLDL works on QDLDL_float
155 |     std::vector<T> h_stage(An);
156 | 
157 |     QDLDL_int *d_row_ind, *d_col_ptr;
158 |     QDLDL_float *d_val;
159 |     gpuErrchk(cudaMalloc(&d_col_ptr, (state_size*knot_points+1)*sizeof(QDLDL_int)));
160 |     gpuErrchk(cudaMalloc(&d_row_ind, nnz*sizeof(QDLDL_int)));
161 |     gpuErrchk(cudaMalloc(&d_val, nnz*sizeof(QDLDL_float)));
162 | 
163 |     // fill col ptr and row ind, these won't change 
164 |     prep_csr<<<knot_points, 64>>>(state_size, knot_points, d_col_ptr, d_row_ind);
165 |     gpuErrchk(cudaPeekAtLastError());
166 |     gpuErrchk(cudaMemcpy(h_col_ptr.data(), d_col_ptr, (state_size*knot_points+1)*sizeof(QDLDL_int), cudaMemcpyDeviceToHost));
167 |     gpuErrchk(cudaMemcpy(h_row_ind.data(), d_row_ind, (nnz)*sizeof(QDLDL_int), cudaMemcpyDeviceToHost));
168 | 
169 |     // Q things
170 |     QDLDL_int  sumLnz;
171 |     QDLDL_int *etree;
172 |     QDLDL_int *Lnz;
173 |     etree = (QDLDL_int*)malloc(sizeof(QDLDL_int)*An);
174 |     Lnz   = (QDLDL_int*)malloc(sizeof(QDLDL_int)*An);
175 | 
176 |     QDLDL_int *Lp;
177 |     QDLDL_float *D;
178 |     QDLDL_float *Dinv;
179 |     Lp    = (QDLDL_int*)malloc(sizeof(QDLDL_int)*(An+1));
180 |     D     = (QDLDL_float*)malloc(sizeof(QDLDL_float)*An);
181 |     Dinv  = (QDLDL_float*)malloc(sizeof(QDLDL_float)*An);
182 | 
183 |     //working data for factorisation
184 |     QDLDL_int   *iwork;
185 |     QDLDL_bool  *bwork;
186 |     QDLDL_float *fwork;
187 |     iwork = (QDLDL_int*)malloc(sizeof(QDLDL_int)*(3*An));
188 |     bwork = (QDLDL_bool*)malloc(sizeof(QDLDL_bool)*An);
189 |     fwork = (QDLDL_float*)malloc(sizeof(QDLDL_float)*An);
190 | 
191 |     sumLnz = QDLDL_etree(An,h_col_ptr.data(),h_row_ind.data(),iwork,Lnz,etree);
192 |     // QDLDL_etree signals failure with a negative result; Li/Lx must not be sized from it
193 |     if (sumLnz < 0) { printf ("QDLDL_etree failed\n"); exit(13); }
194 | 
195 |     QDLDL_int *Li;
196 |     QDLDL_float *Lx;
197 |     Li    = (QDLDL_int*)malloc(sizeof(QDLDL_int)*sumLnz);
198 |     Lx    = (QDLDL_float*)malloc(sizeof(QDLDL_float)*sumLnz);
199 | 
200 |     gpuErrchk(cudaPeekAtLastError());
201 |     gpuErrchk(cudaDeviceSynchronize());
202 | #if TIME_LINSYS == 1
203 |     struct timespec linsys_start, linsys_end;
204 |     double linsys_time;
205 | #endif
206 | #if CONST_UPDATE_FREQ
207 |     struct timespec sqp_cur;
208 |     auto sqpTimecheck = [&]() {
209 |         clock_gettime(CLOCK_MONOTONIC, &sqp_cur);
210 |         return time_delta_us_timespec(sqp_solve_start,sqp_cur) > SQP_MAX_TIME_US;
211 |     };
212 | #else
213 |     auto sqpTimecheck = [&]() { return false; };
214 | #endif
215 | 
216 |     // merit of the incoming iterate, the baseline for the first line search
217 |     ///TODO: atomic race conditions here aren't fixed but don't seem to be problematic
218 |     compute_merit<T><<<knot_points, MERIT_THREADS, merit_smem_size>>>(
219 |         state_size, control_size, knot_points,
220 |         d_xu, 
221 |         d_eePos_traj, 
222 |         static_cast<T>(10), 
223 |         timestep, 
224 |         d_dynMem_const, 
225 |         d_merit_initial
226 |     );
227 |     gpuErrchk(cudaMemcpyAsync(&h_merit_initial, d_merit_initial, sizeof(T), cudaMemcpyDeviceToHost));
228 |     gpuErrchk(cudaPeekAtLastError());
229 | 
230 |     // h_merit_initial is first read after the cudaDeviceSynchronize() that follows the line-search launches
231 | 
232 | 
233 | 
234 |     //
235 |     //      SQP LOOP
236 |     //
237 |     for(uint32_t sqpiter = 0; sqpiter < SQP_MAX_ITER; sqpiter++){
238 |         // build the dense KKT blocks around the current iterate
239 |         generate_kkt_submatrices<T><<<knot_points, KKT_THREADS, 2 * get_kkt_smem_size<T>(state_size, control_size)>>>(
240 |             state_size,
241 |             control_size,
242 |             knot_points,
243 |             d_G_dense, 
244 |             d_C_dense, 
245 |             d_g, 
246 |             d_c,
247 |             d_dynMem_const,
248 |             timestep,
249 |             d_eePos_traj,
250 |             d_xs,
251 |             d_xu
252 |         );
253 |         gpuErrchk(cudaPeekAtLastError());
254 |         if (sqpTimecheck()){ break; }
255 | 
256 |         // reduce to the Schur system: matrix values go to d_val, right-hand side to d_gamma
257 |         form_schur_system_qdldl<T>(state_size, control_size, knot_points, d_G_dense, d_C_dense, d_g, d_c, d_val, d_gamma, rho);
258 |         gpuErrchk(cudaPeekAtLastError());
259 |         if (sqpTimecheck()){ break; }
260 | 
261 |     #if TIME_LINSYS == 1
262 |         gpuErrchk(cudaDeviceSynchronize());
263 |         if (sqpTimecheck()){ break; }
264 |         clock_gettime(CLOCK_MONOTONIC, &linsys_start);
265 |     #endif // #if TIME_LINSYS
266 | 
267 |         // d_val already holds QDLDL_float; gamma and lambda are T on the device and converted on the host
268 |         gpuErrchk(cudaMemcpy(h_val.data(), d_val, (nnz)*sizeof(QDLDL_float), cudaMemcpyDeviceToHost));
269 |         gpuErrchk(cudaMemcpy(h_stage.data(), d_gamma, (state_size*knot_points)*sizeof(T), cudaMemcpyDeviceToHost));
270 |         for(QDLDL_int i = 0; i < An; i++){ h_gamma[i] = static_cast<QDLDL_float>(h_stage[i]); }
271 |         qdldl_solve_schur(An, h_col_ptr.data(), h_row_ind.data(), h_val.data(), h_gamma.data(), h_lambda.data(), Lp, Li, Lx, D, Dinv, Lnz, etree, bwork, iwork, fwork);
272 |         for(QDLDL_int i = 0; i < An; i++){ h_stage[i] = static_cast<T>(h_lambda[i]); }
273 |         gpuErrchk(cudaMemcpy(d_lambda, h_stage.data(), (state_size*knot_points)*sizeof(T), cudaMemcpyHostToDevice));
274 | 
275 | 
276 |     #if TIME_LINSYS == 1
277 |         gpuErrchk(cudaDeviceSynchronize());
278 |         clock_gettime(CLOCK_MONOTONIC, &linsys_end);
279 | 
280 |         linsys_time = time_delta_us_timespec(linsys_start, linsys_end);
281 |         linsys_time_vec.push_back(linsys_time);
282 |     #endif // #if TIME_LINSYS
283 | 
284 |         if (sqpTimecheck()){ break; }
285 | 
286 |         // recover dz
287 |         compute_dz(
288 |             state_size,
289 |             control_size,
290 |             knot_points,
291 |             d_Ginv_dense, 
292 |             d_C_dense, 
293 |             d_g, 
294 |             d_lambda, 
295 |             d_dz
296 |         );
297 |         gpuErrchk(cudaPeekAtLastError());
298 |         if (sqpTimecheck()){ break; }
299 |         // line search: one cooperative launch per candidate step, each on its own stream
300 |         // (shared memory is sized exactly as for compute_merit above, via merit_smem_size)
301 |         // NOTE(review): mu and timestep reach the kernel by address as float; confirm its parameter types
302 |         for(uint32_t p = 0; p < num_alphas; p++){
303 |             void *kernelArgs[] = {
304 |                 (void *)&state_size,
305 |                 (void *)&control_size,
306 |                 (void *)&knot_points,
307 |                 (void *)&d_xs,
308 |                 (void *)&d_xu,
309 |                 (void *)&d_eePos_traj,
310 |                 (void *)&mu, 
311 |                 (void *)&timestep,
312 |                 (void *)&d_dynMem_const,
313 |                 (void *)&d_dz,
314 |                 (void *)&p,
315 |                 (void *)&d_merit_news,
316 |                 (void *)&d_merit_temp
317 |             };
318 |             gpuErrchk(cudaLaunchCooperativeKernel(ls_merit_kernel, knot_points, MERIT_THREADS, kernelArgs, merit_smem_size, streams[p]));
319 |         }
320 |         if (sqpTimecheck()){ break; }
321 |         gpuErrchk(cudaPeekAtLastError());
322 |         gpuErrchk(cudaDeviceSynchronize());
323 | 
324 |         // gather the merit of every candidate step
325 |         gpuErrchk(cudaMemcpy(h_merit_news, d_merit_news, num_alphas*sizeof(T), cudaMemcpyDeviceToHost));
326 |         if (sqpTimecheck()){ break; }
327 | 
328 |         // pick the candidate with the smallest merit
329 |         line_search_step = 0;
330 |         min_merit = h_merit_initial;
331 |         for(uint32_t i = 0; i < num_alphas; i++){
332 |             // strict '<': a candidate must improve on the best merit seen so far to be chosen
333 |             ///TODO: reduction ratio
334 |             if(h_merit_news[i] < min_merit){
335 |                 min_merit = h_merit_news[i];
336 |                 line_search_step = i;
337 |             }
338 |         }
339 | 
340 | 
341 |         if(min_merit == h_merit_initial){
342 |             // line search failure
343 |             drho = max(drho*rho_factor, rho_factor);
344 |             rho = max(rho*drho, rho_min);
345 |             sqp_iter++;
346 |             if(rho > rho_max){
347 |                 sqp_time_exit = 0;
348 |                 rho = rho_reset;
349 |                 break; 
350 |             }
351 |             continue;
352 |         }
353 |         // a candidate improved the merit: its index fixes the step length, then rho is relaxed
354 |         alphafinal = -1.0 / (1 << line_search_step);        // alpha sign
355 | 
356 |         drho = min(drho/rho_factor, 1/rho_factor);
357 |         rho = max(rho*drho, rho_min);
358 | 
359 |         // d_xu += alphafinal * d_dz
360 | #if USE_DOUBLES
361 |         cublasDaxpy(
362 |             handle, 
363 |             DZ_SIZE_BYTES / sizeof(T),
364 |             &alphafinal,
365 |             d_dz, 1,
366 |             d_xu, 1
367 |         );
368 | #else
369 |         cublasSaxpy(
370 |             handle, 
371 |             DZ_SIZE_BYTES / sizeof(T),
372 |             &alphafinal,
373 |             d_dz, 1,
374 |             d_xu, 1
375 |         );
376 | #endif
377 | 
378 |         gpuErrchk(cudaPeekAtLastError());
379 |         // if success increment after update
380 |         sqp_iter++;
381 | 
382 |         if (sqpTimecheck()){ break; }
383 | 
384 | 
385 |         delta_merit_iter = h_merit_initial - min_merit;
386 |         delta_merit_total += delta_merit_iter;
387 | 
388 | 
389 |         h_merit_initial = min_merit;
390 | 
391 |     }
392 | 
393 |     gpuErrchk(cudaPeekAtLastError());
394 |     gpuErrchk(cudaDeviceSynchronize());
395 |     clock_gettime(CLOCK_MONOTONIC, &sqp_solve_end);
396 | 
397 |     cublasDestroy(handle);
398 | 
399 |     for(uint32_t st=0; st < num_alphas; st++){
400 |         gpuErrchk(cudaStreamDestroy(streams[st]));
401 |     }
402 | 
403 | 
404 | 
405 |     // release device buffers
406 |     gpuErrchk(cudaFree(d_merit_initial));
407 |     gpuErrchk(cudaFree(d_merit_news));
408 |     gpuErrchk(cudaFree(d_merit_temp));
409 |     gpuErrchk(cudaFree(d_G_dense));
410 |     gpuErrchk(cudaFree(d_C_dense));
411 |     gpuErrchk(cudaFree(d_g));
412 |     gpuErrchk(cudaFree(d_c));
413 |     gpuErrchk(cudaFree(d_gamma));
414 |     gpuErrchk(cudaFree(d_dz));
415 |     gpuErrchk(cudaFree(d_xs));
416 |     gpuErrchk(cudaFree(d_col_ptr));
417 |     gpuErrchk(cudaFree(d_row_ind));
418 |     gpuErrchk(cudaFree(d_val));
419 | 
420 |     // release host-side QDLDL storage
421 |     free(etree);
422 |     free(Lnz);
423 |     free(Lp);
424 |     free(D);
425 |     free(Dinv);
426 |     free(iwork);
427 |     free(bwork);
428 |     free(fwork);
429 |     free(Li);
430 |     free(Lx);
431 | 
432 |     double sqp_solve_time = time_delta_us_timespec(sqp_solve_start, sqp_solve_end);
433 | 
434 |     return std::make_tuple(linsys_iter_vec, linsys_time_vec, sqp_solve_time, sqp_iter, sqp_time_exit, linsys_exit_vec);
435 | }
436 | 


--------------------------------------------------------------------------------
/include/utils/csr.cuh:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | #include <cstdint>
 3 | #include <cooperative_groups.h>
 4 | #include "glass.cuh"
 5 | #include "qdldl.h"
 6 | #include <fstream>
 7 | 
 8 | 
 9 | // Writes one dense bdim x bdim block into the CSR value array of the lower triangle
10 | // of a symmetric block tridiagonal matrix.
11 | //   d_src        : source block, read as d_src[row + col*bdim]
12 | //   d_val        : destination value array
13 | //   col1         : false -> block left of the diagonal, true -> diagonal block (only row+1 entries per row are written)
14 | //   bd_block_row : block row being written;  multiplier : factor applied to every stored entry
15 | template <typename T>
16 | __device__
17 | void store_block_csr_lowertri(uint32_t bdim, uint32_t mdim, T *d_src, QDLDL_float *d_val, bool col1, uint32_t bd_block_row, int32_t multiplier=1){
18 |     const int tri_ct = ((bdim+1)*bdim)/2;                // entries in one diagonal (triangular) block
19 |     const int brow_val_ct = bdim*bdim + tri_ct;          // entries in one full block row
20 |     const bool first_brow = (bd_block_row == 0);         // block row 0 holds only a triangle
21 | 
22 |     // each thread handles whole rows of the block
23 |     for(int r = threadIdx.x; r < bdim; r += blockDim.x){
24 |         int row_base = (r*(r+1))/2;                      // triangle entries of the preceding rows
25 |         if(!first_brow){
26 |             row_base += tri_ct + (bd_block_row-1)*brow_val_ct + r*bdim;
27 |         }
28 |         // diagonal-block entries sit after the bdim off-diagonal entries of the same row
29 |         const int dst_base = (col1 && !first_brow) ? row_base + bdim : row_base;
30 |         const int n_write = col1 ? (r+1) : (first_brow ? 0 : bdim);
31 | 
32 |         for(int c = 0; c < n_write; c++){
33 |             d_val[dst_base + c] = static_cast<QDLDL_float>(d_src[r + c*bdim]) * multiplier;
34 |         }
35 |     }
36 | }
37 | 
38 | 
39 | // Builds the sparsity pattern (pointer array and index array) for the lower triangle of a symmetric
40 | // block tridiagonal matrix made of knot_points block rows of size state_size.
41 | // Thread blocks stride over block rows; the threads of a block stride over the rows inside it.
42 | //   d_col_ptr : receives state_size*knot_points+1 offsets, entry 0 being 0
43 | //   d_row_ind : receives one index per stored nonzero
44 | __global__
45 | void prep_csr(uint32_t state_size, uint32_t knot_points, QDLDL_int *d_col_ptr, QDLDL_int *d_row_ind){
46 | 
47 |     const int tri_ct = ((state_size+1)*state_size)/2;            // entries in one diagonal (triangular) block
48 |     const int brow_val_ct = state_size*state_size + tri_ct;      // entries in one full block row
49 | 
50 |     for(uint32_t brow = blockIdx.x; brow < knot_points; brow += gridDim.x){
51 |         const bool first_brow = (brow == 0);    // block row 0 has no block left of its diagonal
52 | 
53 |         for(int r = threadIdx.x; r < state_size; r += blockDim.x){
54 |             if(first_brow && r == 0){
55 |                 d_col_ptr[0] = 0;
56 |             }
57 | 
58 |             // offset of this row's first stored entry
59 |             int row_start = (r*(r+1))/2;
60 |             if(!first_brow){
61 |                 row_start += tri_ct + (brow-1)*brow_val_ct + r*state_size;
62 |             }
63 | 
64 |             // a full off-diagonal row (when present) followed by the entries up to the diagonal
65 |             const int row_len = first_brow ? (r+1) : (state_size + r+1);
66 |             d_col_ptr[brow*state_size + r+1] = row_start + row_len;
67 | 
68 |             const uint32_t first_idx = first_brow ? 0 : (brow-1)*state_size;
69 |             for(int c = 0; c < row_len; c++){
70 |                 d_row_ind[row_start + c] = first_idx + c;
71 |             }
72 |         }
73 |     }
74 | }


--------------------------------------------------------------------------------
/include/utils/experiment.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <stdio.h>
  4 | #include <cstring>
  5 | #include <random>
  6 | #include <vector>
  7 | #include <numeric>
  8 | #include <algorithm>
  9 | #include <thread>
 10 | #include <time.h>
 11 | #include <pthread.h>
 12 | #include <iostream>
 13 | // Elapsed time from `start` to `end` (two struct timespec values) in microseconds, as a double.
 14 | #define time_delta_us_timespec(start,end) (1e6*static_cast<double>((end).tv_sec - (start).tv_sec)+1e-3*static_cast<double>((end).tv_nsec - (start).tv_nsec))
 15 | 
 16 | template<bool PRINT_DISTRIBUTION = true>
 17 | void printStats(std::vector<double> *times){
 18 |    double sum = std::accumulate(times->begin(), times->end(), 0.0);
 19 |    double mean = sum/static_cast<double>(times->size());
 20 |    std::vector<double> diff(times->size());
 21 |    std::transform(times->begin(), times->end(), diff.begin(), [mean](double x) {return x - mean;});
 22 |    double sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
 23 |    double stdev = std::sqrt(sq_sum / times->size());
 24 |    std::vector<double>::iterator minInd = std::min_element(times->begin(), times->end());
 25 |    std::vector<double>::iterator maxInd = std::max_element(times->begin(), times->end());
 26 |    double min = times->at(std::distance(times->begin(), minInd)); 
 27 |    double max = times->at(std::distance(times->begin(), maxInd));
 28 |    printf("Average[%fus] Std Dev [%fus] Min [%fus] Max [%fus] \n",mean,stdev,min,max);
 29 |    if (PRINT_DISTRIBUTION){
 30 |       double hist[] = {0,0,0,0,0,0,0};
 31 |       for(int i = 0; i < times->size(); i++){
 32 |          double value = times->at(i);
 33 |          if (value < mean - stdev){
 34 |             if (value < mean - 2*stdev){
 35 |                if (value < mean - 3*stdev){hist[0] += 1.0;}
 36 |                else{hist[1] += 1.0;}
 37 |             }
 38 |             else{hist[2] += 1.0;}
 39 |          }
 40 |          else if (value > mean + stdev){
 41 |             if (value > mean + 2*stdev){
 42 |                if (value > mean + 3*stdev){hist[6] += 1.0;}
 43 |                else{hist[5] += 1.0;}
 44 |             }
 45 |             else{hist[4] += 1.0;}
 46 |          }
 47 |          else{hist[3] += 1.0;}
 48 |       }
 49 |       for(int i = 0; i < 7; i++){hist[i] = (hist[i]/static_cast<double>(times->size()))*100;}
 50 |       printf("    Distribution |  -3  |  -2  |  -1  |   0  |   1  |   2  |   3  |\n");
 51 |       printf("    (X std dev)  | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f |\n",
 52 |                                 hist[0],hist[1],hist[2],hist[3],hist[4],hist[5],hist[6]);
 53 |       std::sort(times->begin(), times->end()); 
 54 |       printf("    Percentiles |  50   |  60   |  70   |  75   |  80   |  85   |  90   |  95   |  99   |\n");
 55 |       printf("                | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f |\n",
 56 |                               times->at(times->size()/2),times->at(times->size()/5*3),times->at(times->size()/10*7),
 57 |                               times->at(times->size()/4*3),times->at(times->size()/5*4),times->at(times->size()/20*17),
 58 |                               times->at(times->size()/10*9),times->at(times->size()/20*19),times->at(times->size()/100*99));
 59 |       bool onePer = false; bool twoPer = false; bool fivePer = false; bool tenPer = false;
 60 |       for(int i = 0; i < times->size(); i++){
 61 |          if(!onePer && times->at(i) >= mean * 1.01){ onePer = true;
 62 |             printf("    More than 1 Percent above mean at [%2.2f] Percentile\n",static_cast<double>(i)/static_cast<double>(times->size())*100.0);
 63 |          }
 64 |          if(!twoPer && times->at(i) >= mean * 1.02){ twoPer = true;
 65 |             printf("    More than 2 Percent above mean at [%2.2f] Percentile\n",static_cast<double>(i)/static_cast<double>(times->size())*100.0);
 66 |          }
 67 |          if(!fivePer && times->at(i) >= mean * 1.05){ fivePer = true;
 68 |             printf("    More than 5 Percent above mean at [%2.2f] Percentile\n",static_cast<double>(i)/static_cast<double>(times->size())*100.0);
 69 |          }
 70 |          if(!tenPer && times->at(i) >= mean * 1.10){ tenPer = true;
 71 |             printf("    More than 10 Percent above mean at [%2.2f] Percentile\n",static_cast<double>(i)/static_cast<double>(times->size())*100.0);
 72 |          }
 73 |       }
 74 |    }
 75 | }
 76 | 
 77 | std::string getCurrentTimestamp() {
 78 |    time_t rawtime;
 79 |    struct tm * timeinfo;
 80 |    char buffer[80];
 81 |    time(&rawtime);
 82 |    timeinfo = localtime(&rawtime);
 83 |    strftime(buffer, sizeof(buffer), "%Y%m%d_%H%M%S", timeinfo);
 84 |    std::string timestampStr(buffer);
 85 |    return timestampStr;
 86 | }
 87 | 
 88 | // Function to format stats string values into CSV format
 89 | std::string getStatsString(const std::string& statsString) {
 90 |    std::stringstream ss(statsString);
 91 |    std::string token;
 92 |    std::string csvFormattedString;
 93 |    
 94 |    while (getline(ss, token, '[')) {
 95 |        if (getline(ss, token, ']')) {
 96 |            if (!csvFormattedString.empty()) {
 97 |                csvFormattedString += ",";
 98 |            }
 99 |            csvFormattedString += token;
100 |        }
101 |    }
102 |    
103 |    return csvFormattedString;
104 | }
105 | 
/**
 * Computes summary statistics of `data`, prints them to stdout (preceded by a
 * blank line) and returns the same text.
 *
 * The text has the form
 *   "Average[..] Std Dev [..] Min [..] Max [..] Median [..] Q1 [..] Q3 [..]"
 * where Std Dev is the population standard deviation and Q1/Q3 are taken from
 * the sorted copy at positions n/4 and 3n/4 (averaging with the preceding
 * element when n is even). `data` itself is not modified.
 *
 * Mean, standard deviation, median and quartiles are computed in double so
 * that integer and unsigned element types are not truncated or wrapped.
 * For empty input every field is reported as "nan".
 *
 * @param data    samples to summarise; must not be null
 * @param prefix  currently unused; kept for interface compatibility
 * @return the formatted statistics line (without trailing newline)
 */
template<typename T>
std::string printStats(std::vector<T> *data, std::string prefix = "data"){
   (void)prefix;
   const std::size_t n = data->size();
   std::stringstream ss;

   if (n == 0) {
      // no samples: keep the field layout so downstream CSV conversion still sees 7 columns
      ss << "Average[nan] Std Dev [nan] Min [nan] Max [nan] Median [nan] Q1 [nan] Q3 [nan]";
   } else {
      const double count = static_cast<double>(n);

      double sum = 0.0;
      for (const T &value : *data) { sum += static_cast<double>(value); }
      const double mean = sum/count;

      double sq_sum = 0.0;
      for (const T &value : *data) {
         const double deviation = static_cast<double>(value) - mean;
         sq_sum += deviation*deviation;
      }
      const double stdev = std::sqrt(sq_sum / count);

      const T smallest = *std::min_element(data->begin(), data->end());
      const T largest  = *std::max_element(data->begin(), data->end());

      // Sort a copy to obtain median and quartiles without disturbing the caller's order
      std::vector<T> sortedData(*data);
      std::sort(sortedData.begin(), sortedData.end());

      const std::size_t mid = n/2;
      const std::size_t q1  = n/4;
      const std::size_t q3  = (3*n)/4;
      double median, Q1, Q3;
      if (n % 2 == 0) {
         // for n == 2, q1 is 0 and q1 - 1 would wrap around; clamp to the first element
         const std::size_t q1Lower = (q1 > 0) ? q1 - 1 : 0;
         median = (static_cast<double>(sortedData[mid - 1]) + static_cast<double>(sortedData[mid])) / 2.0;
         Q1 = (static_cast<double>(sortedData[q1Lower]) + static_cast<double>(sortedData[q1])) / 2.0;
         Q3 = (static_cast<double>(sortedData[q3 - 1]) + static_cast<double>(sortedData[q3])) / 2.0;
      } else {
         median = static_cast<double>(sortedData[mid]);
         Q1 = static_cast<double>(sortedData[q1]);
         Q3 = static_cast<double>(sortedData[q3]);
      }

      ss << "Average[" << mean << "] Std Dev [" << stdev << "] Min [" << smallest
         << "] Max [" << largest << "] Median [" << median << "] Q1 [" << Q1
         << "] Q3 [" << Q3 << "]";
   }

   const std::string summary = ss.str();
   std::cout << std::endl;
   std::cout << summary << std::endl;
   return summary;
}
143 | 
/**
 * Reads a comma-separated file of numbers into a vector of rows.
 *
 * Each line of the file becomes one row; each comma-separated field is parsed
 * as a floating-point number and converted to T. Fields are parsed in double
 * precision so that T = double (or a wide integer type) does not lose
 * precision through an intermediate float. A trailing carriage return
 * (Windows line ending) is stripped before parsing. A blank line yields an
 * empty row.
 *
 * If the file cannot be opened, a message is written to stderr and an empty
 * result is returned.
 *
 * @param filename  path of the CSV file
 * @return one inner vector per line of the file
 * @throws std::invalid_argument / std::out_of_range (from std::stod) if a
 *         field is not a parsable number
 */
template <typename T>
std::vector<std::vector<T>> readCSVToVecVec(const std::string& filename) {
    std::vector<std::vector<T>> data;
    std::ifstream infile(filename);

    if (!infile.is_open()) {
        std::cerr << "File [ " << filename << " ] could not be opened!\n";
        return data;
    }

    std::string line;
    while (std::getline(infile, line)) {
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();    // tolerate CRLF line endings
        }

        std::vector<T> row;
        std::stringstream fields(line);
        std::string val;
        while (std::getline(fields, val, ',')) {
            row.push_back(static_cast<T>(std::stod(val)));
        }

        data.push_back(row);
    }

    // infile is closed by its destructor
    return data;
}


--------------------------------------------------------------------------------
/include/utils/matrix.cuh:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | #include <cstdint>
  3 | // TODO: GBD-PCG utils include fix
  4 | #include "utils.cuh"
  5 | 
  6 |     
  7 | 
  8 | 
  9 | template <typename T>
 10 | __device__
 11 | void gato_ATx(T *out, T *mat, T *vec, int m, int n){
 12 | 
 13 |     T res;
 14 |     int ind, thing;
 15 | 
 16 |     for(ind=threadIdx.x; ind < n; ind +=blockDim.x){
 17 | 
 18 |         res = 0;
 19 |         for(thing=0; thing<m; thing++){
 20 |             res += mat[ind*m+thing] * vec[thing];
 21 |         }
 22 | 
 23 |         out[ind] = res;
 24 |     }
 25 | }
 26 | 
 27 | template <typename T>
 28 | __device__
 29 | void gato_vec_dif(T *out, T *vec1, T *vec2, int size){
 30 |     for(int i = threadIdx.x; i < size; i+= blockDim.x){
 31 |         out[i] = vec1[i] - vec2[i];
 32 |     }
 33 | }
 34 | 
 35 | template <typename T>
 36 | __device__
 37 | void gato_vec_sum(T *out, T *vec1, T *vec2, int size){
 38 |     for(int i = threadIdx.x; i < size; i+= blockDim.x){
 39 |         out[i] = vec1[i] + vec2[i];
 40 |     }
 41 | }
 42 | 
 43 | 
 44 | template <typename T>
 45 | __device__
 46 | void mat_vec_prod(unsigned MAT_ROWS, unsigned MAT_COLS, T *mat, T *vec, T *out){
 47 |     
 48 |     for(unsigned row=threadIdx.x; row<MAT_ROWS; row+=blockDim.x){
 49 |         T res = static_cast<T>(0);
 50 |         for (unsigned col = 0; col < MAT_COLS; col++){
 51 |             res += mat[row + col*MAT_ROWS] * vec[col];
 52 |         }
 53 |         out[row] = res;
 54 |     }
 55 | }
 56 | 
 57 | template <typename T>
 58 | __device__
 59 | void add_identity(T *A, unsigned dim, T factor){
 60 |     for(unsigned i = threadIdx.x; i < dim*dim; i+=blockDim.x){
 61 |         if(i/dim == i%dim){ A[i] += factor; }
 62 |     }
 63 | }
 64 | 
 65 | 
 66 | 
 67 | // load identity in so memory is [A | I]
 68 | template <typename T>
 69 | __device__ __forceinline__
 70 | void loadIdentity(uint32_t DIM, T *A){
 71 |     for (unsigned ind = threadIdx.x; ind < DIM*DIM; ind += blockDim.x){
 72 |         unsigned r, c;
 73 |         r = ind % DIM; 
 74 |         c = ind / DIM;
 75 |         A[ind] = static_cast<T>(r == c);
 76 |     }
 77 | }
 78 | 
 79 | // load identity in so memory is [V | I]
 80 | template <typename T>
 81 | __device__ __forceinline__
 82 | void loadIdentity(uint32_t DIMA, uint32_t DIMB, T *A, T *B){
 83 |     for (unsigned ind = threadIdx.x; ind < DIMA*DIMA+DIMB*DIMB; ind += blockDim.x){
 84 |         unsigned r, c, indAdj; T *V;
 85 |         if (ind < DIMA*DIMA){
 86 |             indAdj = ind;
 87 |             r = indAdj % DIMA; c = indAdj/DIMA; V = A;
 88 |         }
 89 |         else {
 90 |             indAdj = ind - DIMA*DIMA;
 91 |             r = indAdj % DIMB; c = indAdj/DIMB; V = B;
 92 |         }
 93 |         V[indAdj] = static_cast<T>(r == c);
 94 |     }
 95 | }
 96 | 
 97 | 
 98 | // load identity in so memory is [V | I]
 99 | template <typename T>
100 | __device__ __forceinline__
101 | void loadIdentity(unsigned DIMA, unsigned DIMB, unsigned DIMC, T *A, T *B, T *C){
102 |     for (unsigned ind = threadIdx.x; ind < DIMA*DIMA+DIMB*DIMB+DIMC*DIMC; ind += blockDim.x){
103 |         unsigned r, c, indAdj; T *V;
104 |         if (ind < DIMA*DIMA){
105 |             indAdj = ind;
106 |             r = indAdj % DIMA; c = indAdj/DIMA; V = A;
107 |         }
108 |         else if (ind < DIMA*DIMA+DIMB*DIMB){
109 |             indAdj = ind - DIMA*DIMA;
110 |             r = indAdj % DIMB; c = indAdj/DIMB; V = B;
111 |         }
112 |         else{
113 |             indAdj = ind - DIMA*DIMA - DIMB*DIMB;
114 |             r = indAdj % DIMC; c = indAdj/DIMC; V = C;
115 |         }
116 |         V[indAdj] = static_cast<T>(r == c);
117 |     }
118 | }
119 | 
// Block-cooperative in-place Gauss-Jordan elimination without pivoting.
//
// A must address 2*DIM*DIM contiguous elements in column-major order; the
// updates touch columns pivRC .. pivRC+DIM of a DIM-by-2*DIM array. The
// intended layout is [A | I], which the elimination is meant to turn into
// [I | A^-1] — NOTE(review): confirm against callers.
// s_temp must hold 2*DIM+1 elements and be visible to every thread of the block.
//
// Preconditions:
//   - every diagonal pivot encountered is non-zero (no row exchanges are done);
//   - called by ALL threads of the block, since it contains __syncthreads().
//
// For each pivot: rescale the pivot row so the pivot value (pv) becomes 1, then
// for every other row subtract the multiple of the new pivot row that zeroes
// that row's pivot-column value (orpcv):
//   pr *= 1/pv        orv -= orpcv * (1/pv) * prvOld
template <typename T>
__device__
void invertMatrix(uint32_t DIM, T *A, T *s_temp){ 
    for (unsigned pivRC = 0; pivRC < DIM; pivRC++){
        unsigned pivColOffset = pivRC*DIM;
        // Read the pivot before anything is updated in this iteration.
        T pvInv = static_cast<T>(1)/A[pivRC + pivColOffset];
        // Stage the pivot column (DIM entries) followed by the pivot row
        // (DIM+1 entries, starting at the pivot column) into s_temp.
        // Stride by blockDim.x so each entry is copied by exactly one thread.
        for (unsigned ind = threadIdx.x; ind < 2*DIM+1; ind += blockDim.x){
            unsigned AInd;
            if (ind < DIM){AInd = ind + pivColOffset;}
            else{AInd = pivRC + pivColOffset + (ind-DIM)*DIM;}
            s_temp[ind] = A[AInd];
        }
        __syncthreads(); //----------------------
        // Apply the pivot update to the DIM x (DIM+1) window starting at the pivot column.
        for (unsigned ind = threadIdx.x; ind < DIM*(DIM+1); ind += blockDim.x){
            unsigned row = ind % DIM; unsigned col = ind / DIM; unsigned colOffset = ind - row;
            // s_temp = orpcvs|prvOld
            if (row == pivRC){A[row + pivColOffset + colOffset] *= pvInv;}
            else{A[row + pivColOffset + colOffset] -= s_temp[row]*pvInv*s_temp[DIM+col];}
        }
        __syncthreads(); //----------------------
    }
}
149 | 
150 | 
// Block-cooperative in-place Gauss-Jordan elimination (no pivoting) of two
// matrices at once.
//
// A and B must address 2*DIMA*DIMA and 2*DIMB*DIMB contiguous elements
// respectively, in column-major order; the intended layout is [V | I], which
// the elimination is meant to turn into [I | V^-1] — NOTE(review): confirm
// against callers.
// MAX_DIM bounds the pivot loop and must be at least max(DIMA, DIMB) for both
// matrices to be fully processed.
// s_temp must hold 2*DIMA + 2*DIMB + 2 elements and be visible to every thread
// of the block.
//
// Preconditions:
//   - every diagonal pivot encountered is non-zero (no row exchanges are done);
//   - called by ALL threads of the block, since it contains __syncthreads().
//
// For each pivot: rescale the pivot row so the pivot value (pv) becomes 1, then
// for every other row subtract the multiple of the new pivot row that zeroes
// that row's pivot-column value (orpcv):
//   pr *= 1/pv        orv -= orpcv * (1/pv) * prvOld
template <typename T>
__device__
void invertMatrix(unsigned DIMA, unsigned DIMB, unsigned MAX_DIM, T *A, T *B, T *s_temp){

    // s_temp layout: [colA (DIMA) | rowA (DIMA+1) | colB (DIMB) | rowB (DIMB+1)]
    T *s_memA = s_temp; T *s_memB = &s_memA[2*DIMA+1];
    for (unsigned pivRC = 0; pivRC < MAX_DIM; pivRC++){
        // a matrix smaller than MAX_DIM is finished once pivRC passes its dimension
        bool AActive = pivRC < DIMA; bool BActive = pivRC < DIMB;
        unsigned pivColOffsetA = pivRC*DIMA; unsigned pivColOffsetB = pivRC*DIMB;
        // Stage the pivot column; stride by blockDim.x so each entry is copied
        // by exactly one thread.
        for (unsigned ind = threadIdx.x; ind < MAX_DIM; ind += blockDim.x){
            if (AActive && ind < DIMA){s_memA[ind] = A[ind + pivColOffsetA];}
            if (BActive && ind < DIMB){s_memB[ind] = B[ind + pivColOffsetB];}
        }
        // Stage the pivot row (DIM+1 entries, starting at the pivot column).
        for (unsigned ind = threadIdx.x; ind < MAX_DIM+1; ind += blockDim.x){
            if (AActive && ind < DIMA+1){s_memA[ind + DIMA] = A[ind*DIMA + pivRC + pivColOffsetA];}
            if (BActive && ind < DIMB+1){s_memB[ind + DIMB] = B[ind*DIMB + pivRC + pivColOffsetB];}
        }
        __syncthreads(); //----------------------
        // make the pivot update with s_mem = [colA,rowA,colB,rowB]
        for (unsigned ind = threadIdx.x; ind < MAX_DIM*(MAX_DIM+1); ind += blockDim.x){
            if (AActive && ind < DIMA*(DIMA+1)){
                unsigned row = ind % DIMA; unsigned col = ind / DIMA;
                if (row == pivRC){A[pivColOffsetA + ind] /= s_memA[pivRC];}
                else{A[pivColOffsetA + ind] -= s_memA[row]/s_memA[pivRC]*s_memA[DIMA+col];}
            }
            if (BActive && ind < DIMB*(DIMB+1)){
                unsigned row = ind % DIMB; unsigned col = ind / DIMB; 
                if (row == pivRC){B[pivColOffsetB + ind] /= s_memB[pivRC];}
                else{B[pivColOffsetB + ind] -= s_memB[row]/s_memB[pivRC]*s_memB[DIMB+col];}
            }
        }
        __syncthreads(); //----------------------
    }
}
190 | 
// Block-cooperative in-place Gauss-Jordan elimination (no pivoting) of three
// matrices at once.
//
// invert A,B,C assume memory for all is [V | VInv] where both are DIMxDIM and contiguous
// (column-major; A, B and C address 2*DIMA*DIMA, 2*DIMB*DIMB and 2*DIMC*DIMC elements).
// relies on s_temp of size [2*DIMA + 2*DIMB + 2*DIMC + 3], visible to every thread of the block.
// MAX_DIM bounds the pivot loop and must be at least max(DIMA, DIMB, DIMC) for
// all three matrices to be fully processed.
//
// Preconditions:
//   - every diagonal pivot encountered is non-zero (no row exchanges are done);
//   - called by ALL threads of the block, since it contains __syncthreads().
//
// For each pivot: rescale the pivot row so the pivot value (pv) becomes 1, then
// for every other row subtract the multiple of the new pivot row that zeroes
// that row's pivot-column value (orpcv):
//   pr *= 1/pv        orv -= orpcv * (1/pv) * prvOld
template <typename T>
__device__
void invertMatrix(unsigned DIMA, unsigned DIMB, unsigned DIMC, unsigned MAX_DIM, T *A, T *B, T *C, T *s_temp){

    // s_temp layout: [colA | rowA | colB | rowB | colC | rowC], each col of DIM and each row of DIM+1 entries
    T *s_memA = s_temp; T *s_memB = &s_memA[2*DIMA+1]; T *s_memC = &s_memB[2*DIMB+1];
    for (unsigned pivRC = 0; pivRC < MAX_DIM; pivRC++){
        // a matrix smaller than MAX_DIM is finished once pivRC passes its dimension
        bool AActive = pivRC < DIMA; bool BActive = pivRC < DIMB; bool CActive = pivRC < DIMC;
        unsigned pivColOffsetA = pivRC*DIMA; unsigned pivColOffsetB = pivRC*DIMB; unsigned pivColOffsetC = pivRC*DIMC;
        // Stage the pivot column; stride by blockDim.x so each entry is copied
        // by exactly one thread.
        for (unsigned ind = threadIdx.x; ind < MAX_DIM; ind += blockDim.x){
            if (AActive && ind < DIMA){s_memA[ind] = A[ind + pivColOffsetA];}
            if (BActive && ind < DIMB){s_memB[ind] = B[ind + pivColOffsetB];}
            if (CActive && ind < DIMC){s_memC[ind] = C[ind + pivColOffsetC];}
        }
        // Stage the pivot row (DIM+1 entries, starting at the pivot column).
        for (unsigned ind = threadIdx.x; ind < MAX_DIM+1; ind += blockDim.x){
            if (AActive && ind < DIMA+1){s_memA[ind + DIMA] = A[ind*DIMA + pivRC + pivColOffsetA];}
            if (BActive && ind < DIMB+1){s_memB[ind + DIMB] = B[ind*DIMB + pivRC + pivColOffsetB];}
            if (CActive && ind < DIMC+1){s_memC[ind + DIMC] = C[ind*DIMC + pivRC + pivColOffsetC];}
        }
        __syncthreads(); //----------------------
        // make the pivot update with s_mem = [colA,rowA,colB,rowB,colC,rowC]
        for (unsigned ind = threadIdx.x; ind < MAX_DIM*(MAX_DIM+1); ind += blockDim.x){
            if (AActive && ind < DIMA*(DIMA+1)){
                unsigned row = ind % DIMA; unsigned col = ind / DIMA;
                if (row == pivRC){A[pivColOffsetA + ind] /= s_memA[pivRC];}
                else{A[pivColOffsetA + ind] -= s_memA[row]/s_memA[pivRC]*s_memA[DIMA+col];}
            }
            if (BActive && ind < DIMB*(DIMB+1)){
                unsigned row = ind % DIMB; unsigned col = ind / DIMB; 
                if (row == pivRC){B[pivColOffsetB + ind] /= s_memB[pivRC];}
                else{B[pivColOffsetB + ind] -= s_memB[row]/s_memB[pivRC]*s_memB[DIMB+col];}
            }
            if (CActive && ind < DIMC*(DIMC+1)){
                unsigned row = ind % DIMC; unsigned col = ind / DIMC;
                if (row == pivRC){C[pivColOffsetC + ind] /= s_memC[pivRC];}
                else{C[pivColOffsetC + ind] -= s_memC[row]/s_memC[pivRC]*s_memC[DIMC+col];}
            }
        }
        __syncthreads(); //----------------------
    }
}
239 | 
240 | 
// Debug helper (host): copies a rows-by-cols float matrix from device memory
// and writes it as text to "<filename><filesuffix>.txt".
//
// The device buffer is read as column-major (element (r, c) at d_matrix[c*rows + r]).
// Each line of the file holds one matrix row, values separated (and followed) by a tab.
//
// Errors (negative dimensions, failed device copy, unopenable file) are
// reported on stderr and the function returns without throwing.
// The blocking cudaMemcpy also synchronizes with previously issued device work
// on the default stream.
//
// Declared inline because this header-defined function would otherwise be
// multiply defined when included from more than one translation unit.
inline void write_device_matrix_to_file(float* d_matrix, int rows, int cols, const char* filename, int filesuffix = 0) {
    
    char fname[100];
    snprintf(fname, sizeof(fname), "%s%d.txt", filename, filesuffix);

    if (rows < 0 || cols < 0) {
        std::cerr << "Invalid matrix dimensions for file: " << fname << std::endl;
        return;
    }

    // size_t arithmetic so large matrices do not overflow int
    const size_t count = static_cast<size_t>(rows) * static_cast<size_t>(cols);

    // Allocate host memory for the matrix
    float* h_matrix = new float[count];

    // Copy the (contiguous) data from the device to the host memory
    const cudaError_t status = cudaMemcpy(h_matrix, d_matrix, count * sizeof(float), cudaMemcpyDeviceToHost);
    if (status != cudaSuccess) {
        std::cerr << "Device-to-host copy failed for file: " << fname
                  << " (" << cudaGetErrorString(status) << ")" << std::endl;
        delete[] h_matrix;
        return;
    }

    // Write one matrix row per line, reading the host copy in column-major order
    std::ofstream outfile(fname);
    if (outfile.is_open()) {
        outfile << std::setprecision(std::numeric_limits<float>::max_digits10+1);
        for (int row = 0; row < rows; ++row) {
            for (int col = 0; col < cols; ++col) {
                outfile << h_matrix[static_cast<size_t>(col) * rows + row] << "\t";
            }
            outfile << std::endl;
        }
        outfile.close();
    } else {
        std::cerr << "Unable to open file: " << fname << std::endl;
    }

    // Deallocate host memory
    delete[] h_matrix;
}


--------------------------------------------------------------------------------