├── .gitignore ├── .gitmodules ├── Makefile ├── README.md ├── examples ├── track_iiwa_pcg.cu ├── track_iiwa_qdldl.cu └── trajfiles │ ├── 0_0_eepos.traj │ ├── 0_0_traj.csv │ ├── 0_1_traj.csv │ ├── 0_2_traj.csv │ ├── 0_3_traj.csv │ ├── 0_4_traj.csv │ ├── 1_0_traj.csv │ ├── 1_2_traj.csv │ ├── 1_3_traj.csv │ ├── 1_4_traj.csv │ ├── 2_0_traj.csv │ ├── 2_1_traj.csv │ ├── 2_3_traj.csv │ ├── 2_4_traj.csv │ ├── 3_0_traj.csv │ ├── 3_1_traj.csv │ ├── 3_2_traj.csv │ ├── 3_4_traj.csv │ ├── 4_0_traj.csv │ ├── 4_1_traj.csv │ ├── 4_2_traj.csv │ └── 4_3_traj.csv └── include ├── common ├── dz.cuh ├── integrator.cuh ├── kkt.cuh ├── merit.cuh └── settings.cuh ├── dynamics ├── iiwa │ ├── iiwa_eepos_grid.cuh │ ├── iiwa_eepos_plant.cuh │ ├── iiwa_grid.cuh │ └── iiwa_plant.cuh └── rbd_plant.cuh ├── mpcsim.cuh ├── pcg ├── linsys_setup.cuh └── sqp.cuh ├── qdldl ├── linsys_setup.cuh └── sqp.cuh └── utils ├── csr.cuh ├── experiment.cuh └── matrix.cuh /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled source # 2 | ################### 3 | *.exe 4 | *.o 5 | *.so -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "GBD-PCG"] 2 | path = GBD-PCG 3 | url = git@github.com:A2R-Lab/GBD-PCG.git 4 | [submodule "qdldl"] 5 | path = qdldl 6 | url = https://github.com/osqp/qdldl.git 7 | [submodule "GLASS"] 8 | path = GLASS 9 | url = git@github.com:A2R-lab/GLASS.git 10 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | 3 | # Compiler and compiler flags 4 | NVCC = nvcc 5 | CFLAGS = --compiler-options -Wall -O3 -Iinclude -Iinclude/common -IGLASS -IGBD-PCG/include -lqdldl -Iqdldl/include -Lqdldl/build/out -lcublas 6 | 7 | 8 | examples: examples/pcg.exe examples/qdldl.exe 9 | 10 | 
examples/pcg.exe: 11 | $(NVCC) $(CFLAGS) examples/track_iiwa_pcg.cu -o examples/pcg.exe 12 | examples/qdldl.exe: 13 | $(NVCC) $(CFLAGS) -DLINSYS_SOLVE=0 examples/track_iiwa_qdldl.cu -o examples/qdldl.exe 14 | 15 | build_qdldl: 16 | cd qdldl && mkdir -p build && cd build && cmake -DQDLDL_FLOAT=true -DQDLDL_LONG=false .. && cmake --build . && cd ../../ 17 | 18 | clean: 19 | rm -f examples/*.exe 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MPCGPU 2 | 3 | Numerical experiments and the open-source solver from the paper ["MPCGPU: Real-Time Nonlinear Model Predictive Control through Preconditioned Conjugate Gradient on the GPU"](https://arxiv.org/abs/2309.08079) 4 | 5 | ### Building and running examples 6 | 7 | ``` 8 | git clone https://github.com/A2R-Lab/MPCGPU 9 | cd MPCGPU 10 | git submodule update --init --recursive 11 | make build_qdldl 12 | make examples 13 | mkdir -p tmp/results 14 | ``` 15 | Either install the qdldl shared library by running ```cd qdldl/build && make install``` or modify the ```LD_LIBRARY_PATH``` environment variable to include the path to ```MPCGPU/qdldl/build/out```. 16 | 17 | ``` 18 | ./examples/pcg.exe 19 | ./examples/qdldl.exe 20 | ``` 21 | 22 | ### Setting parameters 23 | 24 | You can set a number of parameters in the `include/common/settings.cuh` file. You can also modify these by passing them as 25 | compiler flags. This will override the default values set for these parameters. Please refer to the `Makefile` for 26 | an example. 27 | 28 | ### Other solvers and problems 29 | 30 | You should be able to replace the underlying linear system solver with your own solver. Please refer to `include/qdldl/sqp.cuh` for an example. 31 | 32 | You should also be able to compile and run it for a different problem than the "Kuka IIWA manipulator". Please refer to the `include/dynamics/` folder for an example. 
We use [GRiD](https://github.com/robot-acceleration/GRiD) for computing rigid body dynamics with analytical gradients. 33 | 34 | ### Citing 35 | To cite this work in your research, please use the following bibtex: 36 | ``` 37 | @inproceedings{adabag2024mpcgpu, 38 | title={MPCGPU: Real-Time Nonlinear Model Predictive Control through Preconditioned Conjugate Gradient on the GPU}, 39 | author={Emre Adabag and Miloni Atal and William Gerard and Brian Plancher}, 40 | booktitle={IEEE International Conference on Robotics and Automation (ICRA)}, 41 | address = {Yokohama, Japan}, 42 | month={May.}, 43 | year = {2024} 44 | } 45 | ``` 46 | -------------------------------------------------------------------------------- /examples/track_iiwa_pcg.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "mpcsim.cuh" 8 | #include "dynamics/rbd_plant.cuh" 9 | #include "settings.cuh" 10 | #include "utils/experiment.cuh" 11 | #include "gpu_pcg.cuh" 12 | 13 | 14 | int main(){ 15 | 16 | constexpr uint32_t state_size = grid::NUM_JOINTS*2; 17 | constexpr uint32_t control_size = grid::NUM_JOINTS; 18 | constexpr uint32_t knot_points = KNOT_POINTS; 19 | const linsys_t timestep = .015625; 20 | 21 | const uint32_t traj_test_iters = TEST_ITERS; 22 | 23 | // checks GPU space for pcg 24 | checkPcgOccupancy((void *) pcg, PCG_NUM_THREADS, state_size, knot_points); 25 | 26 | print_test_config(); 27 | // where to store test results — manually create this directory 28 | std::string output_directory_path = "tmp/results/"; 29 | 30 | const uint32_t recorded_states = 5; 31 | const uint32_t start_goal_combinations = recorded_states*recorded_states; 32 | 33 | char eePos_traj_file_name[100]; 34 | char xu_traj_file_name[100]; 35 | 36 | int start_state, goal_state; 37 | linsys_t *d_eePos_traj, *d_xu_traj, *d_xs; 38 | 39 | for(uint32_t ind = 0; ind < start_goal_combinations; ind++){ 40 | 41 | 
start_state = ind % recorded_states; 42 | goal_state = ind / recorded_states; 43 | if(start_state == goal_state && start_state != 0){ continue; } 44 | std::cout << "start: " << start_state << " goal: " << goal_state << std::endl; 45 | 46 | uint32_t num_exit_vals = 5; 47 | float pcg_exit_vals[num_exit_vals]; 48 | if(knot_points==32){ 49 | pcg_exit_vals[0] = 5e-6; 50 | pcg_exit_vals[1] = 7.5e-6; 51 | pcg_exit_vals[2] = 5e-6; 52 | pcg_exit_vals[3] = 2.5e-6; 53 | pcg_exit_vals[4] = 1e-6; 54 | } 55 | else if(knot_points==64){ 56 | pcg_exit_vals[0] = 5e-5; 57 | pcg_exit_vals[1] = 7.5e-5; 58 | pcg_exit_vals[2] = 5e-5; 59 | pcg_exit_vals[3] = 2.5e-5; 60 | pcg_exit_vals[4] = 1e-5; 61 | } 62 | else{ 63 | pcg_exit_vals[0] = 1e-5; 64 | pcg_exit_vals[1] = 5e-5; 65 | pcg_exit_vals[2] = 1e-4; 66 | pcg_exit_vals[3] = 5e-4; 67 | pcg_exit_vals[4] = 1e-3; 68 | } 69 | 70 | 71 | for (uint32_t pcg_exit_ind = 0; pcg_exit_ind < num_exit_vals; pcg_exit_ind++){ 72 | 73 | float pcg_exit_tol = pcg_exit_vals[pcg_exit_ind]; 74 | std::vector linsys_times; 75 | std::vector sqp_iters; 76 | std::vector current_results; 77 | std::vector tracking_errs; 78 | std::vector cur_tracking_errs; 79 | double tot_final_tracking_err = 0; 80 | 81 | std::string test_output_prefix = output_directory_path + std::to_string(KNOT_POINTS) + "_" + ( (LINSYS_SOLVE == 1) ? 
"PCG" : "QDLDL") + "_" + std::to_string(pcg_exit_tol); 82 | printf("Logging test results to files with prefix %s \n", test_output_prefix.c_str()); 83 | 84 | for (uint32_t single_traj_test_iter = 0; single_traj_test_iter < traj_test_iters; single_traj_test_iter++){ 85 | 86 | // read in traj 87 | snprintf(eePos_traj_file_name, sizeof(eePos_traj_file_name), "examples/trajfiles/%d_%d_eepos.traj", start_state, goal_state); 88 | std::vector> eePos_traj2d = readCSVToVecVec(eePos_traj_file_name); 89 | 90 | snprintf(xu_traj_file_name, sizeof(xu_traj_file_name), "examples/trajfiles/%d_%d_traj.csv", start_state, goal_state); 91 | std::vector> xu_traj2d = readCSVToVecVec(xu_traj_file_name); 92 | 93 | if(eePos_traj2d.size() < knot_points){std::cout << "precomputed traj length < knotpoints, not implemented\n"; continue; } 94 | 95 | 96 | std::vector h_eePos_traj; 97 | for (const auto& vec : eePos_traj2d) { 98 | h_eePos_traj.insert(h_eePos_traj.end(), vec.begin(), vec.end()); 99 | } 100 | std::vector h_xu_traj; 101 | for (const auto& xu_vec : xu_traj2d) { 102 | h_xu_traj.insert(h_xu_traj.end(), xu_vec.begin(), xu_vec.end()); 103 | } 104 | 105 | gpuErrchk(cudaMalloc(&d_eePos_traj, h_eePos_traj.size()*sizeof(linsys_t))); 106 | gpuErrchk(cudaMemcpy(d_eePos_traj, h_eePos_traj.data(), h_eePos_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice)); 107 | 108 | gpuErrchk(cudaMalloc(&d_xu_traj, h_xu_traj.size()*sizeof(linsys_t))); 109 | gpuErrchk(cudaMemcpy(d_xu_traj, h_xu_traj.data(), h_xu_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice)); 110 | 111 | gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(linsys_t))); 112 | gpuErrchk(cudaMemcpy(d_xs, h_xu_traj.data(), state_size*sizeof(linsys_t), cudaMemcpyHostToDevice)); 113 | 114 | std::tuple, std::vector, linsys_t> trackingstats = simulateMPC(state_size, control_size, knot_points, 115 | static_cast(eePos_traj2d.size()), timestep, d_eePos_traj, d_xu_traj, d_xs, start_state, goal_state, single_traj_test_iter, pcg_exit_tol, 
test_output_prefix); 116 | 117 | current_results = std::get<0>(trackingstats); 118 | if (TIME_LINSYS == 1) { 119 | linsys_times.insert(linsys_times.end(), current_results.begin(), current_results.end()); 120 | } else { 121 | sqp_iters.insert(sqp_iters.end(), current_results.begin(), current_results.end()); 122 | } 123 | 124 | cur_tracking_errs = std::get<1>(trackingstats); 125 | tracking_errs.insert(tracking_errs.end(), cur_tracking_errs.begin(), cur_tracking_errs.end()); 126 | 127 | tot_final_tracking_err += std::get<2>(trackingstats); 128 | 129 | 130 | 131 | gpuErrchk(cudaFree(d_xu_traj)); 132 | gpuErrchk(cudaFree(d_eePos_traj)); 133 | gpuErrchk(cudaFree(d_xs)); 134 | gpuErrchk(cudaPeekAtLastError()); 135 | 136 | } 137 | 138 | std::cout << "Completed at " << getCurrentTimestamp() << std::endl; 139 | std::cout << "\nRESULTS*************************************\n"; 140 | std::cout << "Exit tol: " << pcg_exit_tol << std::endl; 141 | std::cout << "\nTracking err"; 142 | std::string trackingStats = printStats(&tracking_errs, "trackingerr"); 143 | std::cout << "Average final tracking err: " << tot_final_tracking_err / traj_test_iters << std::endl; 144 | std::string linsysOrSqpStats; 145 | if (TIME_LINSYS == 1) 146 | { 147 | std::cout << "\nLinsys times"; 148 | linsysOrSqpStats = printStats(&linsys_times, "linsystimes"); 149 | } 150 | else 151 | { 152 | std::cout << "\nSqp iters"; 153 | linsysOrSqpStats = printStats(&sqp_iters, "sqpiters"); 154 | } 155 | std::cout << "************************************************\n\n"; 156 | 157 | // Specify the CSV file path 158 | const std::string csvFilePath = test_output_prefix + "_" + "overall_stats.csv"; 159 | 160 | // Open the CSV file for writing 161 | std::ofstream csvFile(csvFilePath); 162 | if (!csvFile.is_open()) { 163 | std::cerr << "Error opening CSV file for writing." 
<< std::endl; 164 | return 1; 165 | } 166 | 167 | // Write the header row 168 | csvFile << "Average,Std Dev, Min, Max, Median, Q1, Q3\n"; 169 | 170 | // Write the data rows 171 | csvFile << getStatsString(trackingStats) << "\n"; 172 | csvFile << getStatsString(linsysOrSqpStats) << "\n"; 173 | 174 | // Close the CSV file 175 | csvFile.close(); 176 | } 177 | break; 178 | } 179 | 180 | 181 | 182 | 183 | return 0; 184 | } 185 | -------------------------------------------------------------------------------- /examples/track_iiwa_qdldl.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "qdldl.h" 8 | #include "mpcsim.cuh" 9 | #include "dynamics/rbd_plant.cuh" 10 | #include "settings.cuh" 11 | #include "utils/experiment.cuh" 12 | 13 | int main(){ 14 | 15 | constexpr uint32_t state_size = grid::NUM_JOINTS*2; 16 | constexpr uint32_t control_size = grid::NUM_JOINTS; 17 | constexpr uint32_t knot_points = KNOT_POINTS; 18 | const linsys_t timestep = .015625; 19 | 20 | const uint32_t traj_test_iters = TEST_ITERS; 21 | 22 | if(!std::is_same::value){ std::cout << "GBD-PCG QDLDL type mismatch" << std::endl; exit(1); } 23 | 24 | print_test_config(); 25 | // where to store test results — manually create this directory 26 | std::string output_directory_path = "tmp/results/"; 27 | 28 | const uint32_t recorded_states = 5; 29 | const uint32_t start_goal_combinations = recorded_states*recorded_states; 30 | 31 | char eePos_traj_file_name[100]; 32 | char xu_traj_file_name[100]; 33 | 34 | int start_state, goal_state; 35 | linsys_t *d_eePos_traj, *d_xu_traj, *d_xs; 36 | 37 | for(uint32_t ind = 0; ind < start_goal_combinations; ind++){ 38 | 39 | start_state = ind % recorded_states; 40 | goal_state = ind / recorded_states; 41 | if(start_state == goal_state && start_state != 0){ continue; } 42 | std::cout << "start: " << start_state << " goal: " << goal_state << std::endl; 
43 | 44 | float linsys_exit_tol = -1; 45 | std::vector linsys_times; 46 | std::vector sqp_iters; 47 | std::vector current_results; 48 | std::vector tracking_errs; 49 | std::vector cur_tracking_errs; 50 | double tot_final_tracking_err = 0; 51 | 52 | std::string test_output_prefix = output_directory_path + std::to_string(KNOT_POINTS) + "_" + ( (LINSYS_SOLVE == 1) ? "PCG" : "QDLDL"); 53 | printf("Logging test results to files with prefix %s \n", test_output_prefix.c_str()); 54 | 55 | for (uint32_t single_traj_test_iter = 0; single_traj_test_iter < traj_test_iters; single_traj_test_iter++){ 56 | 57 | // read in traj 58 | snprintf(eePos_traj_file_name, sizeof(eePos_traj_file_name), "examples/trajfiles/%d_%d_eepos.traj", start_state, goal_state); 59 | std::vector> eePos_traj2d = readCSVToVecVec(eePos_traj_file_name); 60 | 61 | snprintf(xu_traj_file_name, sizeof(xu_traj_file_name), "examples/trajfiles/%d_%d_traj.csv", start_state, goal_state); 62 | std::vector> xu_traj2d = readCSVToVecVec(xu_traj_file_name); 63 | 64 | if(eePos_traj2d.size() < knot_points){std::cout << "precomputed traj length < knotpoints, not implemented\n"; continue; } 65 | 66 | 67 | std::vector h_eePos_traj; 68 | for (const auto& vec : eePos_traj2d) { 69 | h_eePos_traj.insert(h_eePos_traj.end(), vec.begin(), vec.end()); 70 | } 71 | std::vector h_xu_traj; 72 | for (const auto& xu_vec : xu_traj2d) { 73 | h_xu_traj.insert(h_xu_traj.end(), xu_vec.begin(), xu_vec.end()); 74 | } 75 | 76 | gpuErrchk(cudaMalloc(&d_eePos_traj, h_eePos_traj.size()*sizeof(linsys_t))); 77 | gpuErrchk(cudaMemcpy(d_eePos_traj, h_eePos_traj.data(), h_eePos_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice)); 78 | 79 | gpuErrchk(cudaMalloc(&d_xu_traj, h_xu_traj.size()*sizeof(linsys_t))); 80 | gpuErrchk(cudaMemcpy(d_xu_traj, h_xu_traj.data(), h_xu_traj.size()*sizeof(linsys_t), cudaMemcpyHostToDevice)); 81 | 82 | gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(linsys_t))); 83 | gpuErrchk(cudaMemcpy(d_xs, h_xu_traj.data(), 
state_size*sizeof(linsys_t), cudaMemcpyHostToDevice)); 84 | 85 | std::tuple, std::vector, linsys_t> trackingstats = simulateMPC(state_size, control_size, knot_points, 86 | static_cast(eePos_traj2d.size()), timestep, d_eePos_traj, d_xu_traj, d_xs, start_state, goal_state, single_traj_test_iter, linsys_exit_tol, test_output_prefix); 87 | 88 | current_results = std::get<0>(trackingstats); 89 | if (TIME_LINSYS == 1) { 90 | linsys_times.insert(linsys_times.end(), current_results.begin(), current_results.end()); 91 | } else { 92 | sqp_iters.insert(sqp_iters.end(), current_results.begin(), current_results.end()); 93 | } 94 | 95 | cur_tracking_errs = std::get<1>(trackingstats); 96 | tracking_errs.insert(tracking_errs.end(), cur_tracking_errs.begin(), cur_tracking_errs.end()); 97 | 98 | tot_final_tracking_err += std::get<2>(trackingstats); 99 | 100 | 101 | 102 | gpuErrchk(cudaFree(d_xu_traj)); 103 | gpuErrchk(cudaFree(d_eePos_traj)); 104 | gpuErrchk(cudaFree(d_xs)); 105 | gpuErrchk(cudaPeekAtLastError()); 106 | 107 | } 108 | 109 | std::cout << "Completed at " << getCurrentTimestamp() << std::endl; 110 | std::cout << "\nRESULTS*************************************\n"; 111 | std::cout << "exit tol: " << linsys_exit_tol << std::endl; 112 | std::cout << "\nTracking err"; 113 | std::string trackingStats = printStats(&tracking_errs, "trackingerr"); 114 | std::cout << "Average final tracking err: " << tot_final_tracking_err / traj_test_iters << std::endl; 115 | std::string linsysOrSqpStats; 116 | if (TIME_LINSYS == 1) 117 | { 118 | std::cout << "\nLinsys times"; 119 | linsysOrSqpStats = printStats(&linsys_times, "linsystimes"); 120 | } 121 | else 122 | { 123 | std::cout << "\nSqp iters"; 124 | linsysOrSqpStats = printStats(&sqp_iters, "sqpiters"); 125 | } 126 | std::cout << "************************************************\n\n"; 127 | 128 | 129 | // Specify the CSV file path 130 | const std::string csvFilePath = test_output_prefix + "_" + "overall_stats.csv"; 131 | 132 | // Open 
the CSV file for writing 133 | std::ofstream csvFile(csvFilePath); 134 | if (!csvFile.is_open()) { 135 | std::cerr << "Error opening CSV file for writing." << std::endl; 136 | return 1; 137 | } 138 | 139 | // Write the header row 140 | csvFile << "Average,Std Dev, Min, Max, Median, Q1, Q3\n"; 141 | 142 | // Write the data rows 143 | csvFile << getStatsString(trackingStats) << "\n"; 144 | csvFile << getStatsString(linsysOrSqpStats) << "\n"; 145 | 146 | // Close the CSV file 147 | csvFile.close(); 148 | 149 | break; 150 | } 151 | 152 | 153 | 154 | 155 | return 0; 156 | } 157 | -------------------------------------------------------------------------------- /examples/trajfiles/3_4_traj.csv: -------------------------------------------------------------------------------- 1 | 0.2099999934,0.4600000083,-0.8799999952,-0.4099999964,0.2099999934,0.8700000048,1.080000043,0.009999999776,0.01999999955,-0.02999999933,0.03999999911,-0.05000000075,0,0.009999999776,0,0,0,0,0,0,0 2 | 0.2099999934,0.4600000083,-0.8799999952,-0.4099999964,0.2099999934,0.8700000048,1.080000043,0.009999999776,0.01999999955,-0.02999999933,0.03999999911,-0.05000000075,0,0.009999999776,9.991406441,7.832764149,0.5569088459,-25.84409904,-9.808384895,0.01617191359,-1.959708691 3 | 0.210156247,0.4603125155,-0.8804687262,-0.4093749821,0.2092187405,0.8700000048,1.080156326,0.9763528109,-0.6563335657,1.896548152,-2.231715202,-9.520463943,-3.523556709,-1.300929427,4.204999924,10.6261673,3.42241478,-15.94774055,1.098550439,1.951040983,-0.02675159648 4 | 0.2254117578,0.4500572979,-0.8508351445,-0.4442455173,0.06046149135,0.8149444461,1.059829354,1.386060119,-0.8999018669,1.654242396,-3.17975831,-9.153206825,-3.290498018,-1.271032333,1.077785254,12.104496,3.916874886,-9.365999222,1.283721089,0.963589251,-0.04367618263 5 | 
0.2470689416,0.4359963238,-0.8249875903,-0.4939292371,-0.0825573653,0.7635304332,1.039969444,1.397001386,-1.004281044,1.643015862,-3.515737534,-8.717803001,-3.069464445,-1.207003117,0.1252228469,12.27435493,3.402727604,-6.210497379,1.053107738,0.4171015024,-0.05363409594 6 | 0.2688970864,0.4203044176,-0.7993154526,-0.5488626361,-0.2187730372,0.7155700326,1.021110058,1.28299427,-1.029776931,1.585389137,-3.600118637,-8.291081429,-2.893771172,-1.14661479,-0.04410818592,11.87872505,3.018870592,-4.578218937,0.8713625073,0.1302656382,-0.04291247576 7 | 0.2889438868,0.4042141438,-0.7745437622,-0.60511446,-0.3483211994,0.6703548431,1.003194213,1.128157377,-1.013815522,1.502522945,-3.573584557,-7.887279034,-2.739961624,-1.090176344,0.1007819548,11.25065517,2.73128438,-3.638435125,0.7391657233,-0.02230995893,-0.02418729849 8 | 0.3065713346,0.3883732855,-0.7510668635,-0.6609517336,-0.4715599418,0.6275429726,0.9861602187,0.9674246311,-0.9751346707,1.410684347,-3.497129202,-7.504574776,-2.599572182,-1.036983609,0.3587346971,10.5305624,2.498670101,-3.020742893,0.6368814707,-0.1016343087,-0.004605192691 9 | 0.3216873407,0.3731368184,-0.7290249467,-0.7155943513,-0.5888189077,0.5869246721,0.9699573517,0.8156070709,-0.9234204292,1.317762017,-3.397771358,-7.140995979,-2.468681097,-0.9866023064,0.6383093596,9.782183647,2.301772594,-2.558377743,0.5532415509,-0.1400595605,0.01300768554 10 | 0.3344312012,0.3587083817,-0.7084349394,-0.7686845064,-0.700396955,0.5483515263,0.9545416832,0.6786612272,-0.8640217781,1.227555513,-3.28790617,-6.795248032,-2.345368624,-0.9387907982,0.897870481,9.034562111,2.131248474,-2.174699783,0.4826232493,-0.1553330123,0.02781313099 11 | 0.3450352848,0.3452080488,-0.6892544031,-0.8200580478,-0.8065726757,0.5117051601,0.9398730993,0.5584961772,-0.8001363277,1.14192915,-3.173535109,-6.46626091,-2.228666544,-0.8933808208,1.118873596,8.301286697,1.981235147,-1.833651662,0.4219322503,-0.1573725939,0.03973370418 12 | 
0.3537617922,0.332705915,-0.6714117527,-0.8696445227,-0.9076080322,0.4768822491,0.9259140491,0.4550662041,-0.7338258028,1.061833858,-3.057766914,-6.153164387,-2.117984772,-0.850231111,1.29460752,7.589044094,1.847171783,-1.518584013,0.369430244,-0.1518730223,0.04900585487 13 | 0.3608722091,0.3212398887,-0.654820621,-0.9174221158,-1.003751278,0.4437887371,0.9126291871,0.3673797846,-0.6665077209,0.9877112508,-2.942373753,-5.855163574,-2.012966633,-0.8092082739,1.424676538,6.900812626,1.724993825,-1.22185421,0.3238896728,-0.1419952214,0.05593191087 14 | 0.3666125238,0.3108257055,-0.6393876076,-0.963396728,-1.095238209,0.4123361409,0.8999853134,0.2940093875,-0.5992016792,0.919688642,-2.828468561,-5.571601391,-1.913300157,-0.7702032328,1.511806488,6.238016605,1.611262918,-0.9400814772,0.2845953703,-0.1296046227,0.06088227779 15 | 0.3712064326,0.3014631867,-0.6250174642,-1.007591605,-1.182294488,0.3824408352,0.8879508972,0.2333576381,-0.532656312,0.8576809168,-2.716795683,-5.301782608,-1.818749189,-0.733101368,1.560486078,5.601210117,1.503153563,-0.6723706126,0.2509788275,-0.1156345308,0.06419639289 16 | 0.3748526573,0.2931404412,-0.6116161942,-1.050041556,-1.265134811,0.3540228903,0.8764961958,0.1838198304,-0.4674308896,0.8014618158,-2.607881069,-5.04499054,-1.729033947,-0.6978045106,1.575835466,4.990192413,1.39840889,-0.4183811843,0.2222726792,-0.1007424593,0.06611167639 17 | 0.3777248561,0.2858368456,-0.5990933776,-1.090789676,-1.343962789,0.3270067275,0.8655930161,0.143859297,-0.4039468169,0.7507185936,-2.502102375,-4.80072403,-1.64397037,-0.6642226577,1.562997341,4.40459156,1.295410156,-0.1784701943,0.198166877,-0.085193187,0.06693752855 18 | 0.3799726665,0.2795251906,-0.5873634219,-1.129885077,-1.418974161,0.3013196886,0.8552145362,0.1120715067,-0.3425167203,0.705065012,-2.399713516,-4.568335056,-1.563319683,-0.6322647929,1.526815414,3.843975544,1.192992926,0.04716718569,0.1780515462,-0.06919217855,0.06686349213 19 | 
0.3817237914,0.2741733789,-0.576346755,-1.167380571,-1.490354419,0.2768928111,0.8453354239,0.08719525486,-0.2833710611,0.6640833616,-2.300871611,-4.34730196,-1.48684597,-0.6018499732,1.471623421,3.307807446,1.090525031,0.2584060729,0.1615772843,-0.0529362224,0.06611222029 20 | 0.3830862045,0.2697457075,-0.5659704804,-1.203331709,-1.558281064,0.2536608577,0.8359315395,0.06811897457,-0.2266747952,0.6273411512,-2.205647469,-4.137019157,-1.414343953,-0.5728984475,1.40141201,2.795626402,0.9878099561,0.4547972977,0.148204416,-0.03656005114,0.06481542438 21 | 0.3841505647,0.2662039101,-0.5561682582,-1.237794995,-1.622921944,0.2315617353,0.8269799948,0.05387430638,-0.1725497097,0.5944211483,-2.114062786,-3.936994553,-1.345598578,-0.5453414917,1.319521785,2.306937933,0.8847957253,0.6364368796,0.1375202388,-0.02020245604,0.0631146729 22 | 0.3849923611,0.2635078132,-0.546880424,-1.270827174,-1.684437513,0.2105367631,0.818459034,0.04363106191,-0.1210781634,0.5649145246,-2.026085615,-3.746736765,-1.280396581,-0.5191071033,1.228804231,1.8413167,0.7817070484,0.80339396,0.1291450709,-0.004029831383,0.06112636253 23 | 0.3856740892,0.2616159618,-0.5380536318,-1.302484751,-1.742980242,0.1905305684,0.8103479743,0.03667804599,-0.07231400907,0.5384389162,-1.941648245,-3.565755606,-1.218546629,-0.4941310883,1.131683707,1.398428082,0.6788892746,0.9559422135,0.1227005869,0.01181147899,0.05894094706 24 | 0.3862471879,0.2604860663,-0.5296404958,-1.332823038,-1.798695207,0.1714907736,0.8026272058,0.03240851313,-0.02628783137,0.5146428347,-1.860655069,-3.393583775,-1.159866929,-0.4703538716,1.030238032,0.9780305624,0.5767514706,1.094190121,0.1177949458,0.02719751,0.05661784485 25 | 0.3867535591,0.2600753307,-0.5215991735,-1.3618958,-1.851719975,0.1533678472,0.7952779531,0.03031272069,0.01698634401,0.4932011068,-1.783000231,-3.229820967,-1.104172468,-0.4477190077,0.9260895252,0.5797529817,0.4757205248,1.218822241,0.1142327189,0.04200038686,0.0542502813 26 | 
0.3872272074,0.2603407502,-0.513892889,-1.38975513,-1.902185917,0.1361151487,0.7882823348,0.02995982207,0.05751252547,0.4738182127,-1.708558321,-3.073982,-1.051291943,-0.4261698723,0.8207266331,0.2034440339,0.3763141036,1.330058098,0.1115580201,0.05609277263,0.05183074623 27 | 0.3876953423,0.2612393796,-0.5064894557,-1.416451335,-1.950216889,0.1196887121,0.7816234231,0.03098231927,0.09530344605,0.4562348425,-1.637208343,-2.925774097,-1.001076579,-0.4056552947,0.7152611613,-0.1510895938,0.2789497674,1.428749561,0.1097126305,0.06940529495,0.04944870621 28 | 0.3881794512,0.2627284825,-0.4993607998,-1.442032695,-1.995932102,0.1040468886,0.7752850652,0.03307596222,0.1303879917,0.4402119517,-1.568817377,-2.784753799,-0.9533701539,-0.3861263692,0.6106904149,-0.4841172099,0.1840743423,1.515300989,0.1084182039,0.08185072988,0.04710962996 29 | 0.3886962533,0.264765799,-0.4924824834,-1.466545463,-2.03944397,0.08915048093,0.7692518234,0.03598498181,0.1628020108,0.4255379736,-1.503260851,-2.650568724,-0.9080362916,-0.3675367534,0.5078269243,-0.7958704233,0.09210560471,1.590422392,0.1074717715,0.09338294715,0.04482619092 30 | 0.3892585039,0.2673095763,-0.4858334661,-1.490033865,-2.080859184,0.0749624148,0.7635090351,0.03949214891,0.1925904453,0.4120289683,-1.440416217,-2.522915602,-0.8649411201,-0.349837333,0.4072561562,-1.086704135,0.003345089033,1.654977322,0.1068040133,0.1039702445,0.04263185337 31 | 0.389875561,0.2703188062,-0.4793955088,-1.51254034,-2.120279789,0.0614477098,0.7580428123,0.04341764003,0.2198082209,0.3995184004,-1.380160332,-2.401439667,-0.823964119,-0.332991004,0.3095520735,-1.356871963,-0.08186154068,1.709528565,0.106266737,0.1135983095,0.04052871838 32 | 0.3905539513,0.2737533152,-0.4731530249,-1.534105301,-2.157802343,0.04857327044,0.7528398037,0.04761113971,0.2445165366,0.3878610432,-1.322379231,-2.285836458,-0.7849846482,-0.3169545829,0.2151671946,-1.606685758,-0.1632221192,1.754729033,0.1057344228,0.1222458705,0.03851189464 33 | 
0.3912978768,0.2775738835,-0.4670926929,-1.554767489,-2.193518639,0.03630788624,0.747887373,0.05194659531,0.2667831481,0.3769296408,-1.266964555,-2.175850153,-0.7479057908,-0.3016900122,0.1244329885,-1.836529255,-0.2405114174,1.791366577,0.1052514985,0.1299544573,0.03661888093 34 | 0.3921095431,0.2817423642,-0.4612031579,-1.574563861,-2.227516413,0.02462185919,0.74317348,0.05632156879,0.2866835296,0.3666127026,-1.21381247,-2.071133137,-0.7126061916,-0.2871604562,0.0376669839,-2.046811581,-0.3135189116,1.819991589,0.1046021059,0.1367045343,0.03480178118 35 | 0.3929895759,0.2862218022,-0.4554748237,-1.593529701,-2.25987792,0.01348738745,0.7386866212,0.06064805388,0.304296881,0.3568146229,-1.162825584,-1.971506596,-0.6790114045,-0.2733300924,-0.04491040483,-2.237961054,-0.3820841908,1.841253757,0.1038670093,0.142543748,0.03309428319 36 | 0.3939372003,0.2909764349,-0.4498995841,-1.611698866,-2.290682793,0.002877834253,0.7344158292,0.06485706568,0.3197085857,0.3474510908,-1.113912582,-1.876693487,-0.6470288634,-0.2601676881,-0.1231659129,-2.410532713,-0.4461358488,1.855897069,0.103025943,0.147525996,0.03149637952 37 | 0.3949505985,0.2959718704,-0.4444706738,-1.62910378,-2.320006132,-0.007231991738,0.7303507328,0.06889312714,0.3330090046,0.3384487033,-1.06698513,-1.786428213,-0.6165630817,-0.247637406,-0.196956858,-2.565092087,-0.505572319,1.864428282,0.1020112336,0.1516682506,0.02998733148 38 | 0.3960270584,0.3011751473,-0.4391824007,-1.645775437,-2.347918987,-0.01686578989,0.7264813781,0.07270992547,0.3442910612,0.3297465444,-1.021960497,-1.700507045,-0.5875444412,-0.2357103527,-0.2662021816,-2.702188492,-0.5603578091,1.867390394,0.1008147448,0.1550169736,0.02856246382 39 | 0.3971631527,0.3065547049,-0.4340301156,-1.661743522,-2.374489307,-0.02604617178,0.7227984071,0.07627151161,0.353651017,0.3212923706,-0.9787601829,-1.618739486,-0.5599068999,-0.2243593484,-0.3308574855,-2.822480202,-0.6104848981,1.865313768,0.09948141873,0.1576333344,0.02723096125 40 | 
0.398354888,0.3120805025,-0.4290099144,-1.677036643,-2.399782181,-0.03479471803,0.7192928195,0.07955127954,0.3611875176,0.3130425215,-0.9373106956,-1.540904403,-0.5335736871,-0.2135564536,-0.390930146,-2.926653147,-0.6560022831,1.858740449,0.09799384326,0.1595636308,0.02598120831 41 | 0.3995978832,0.3177240491,-0.424118638,-1.6916821,-2.423858881,-0.04313180596,0.7159559727,0.08252950013,0.3670010865,0.3049601912,-0.8975406289,-1.46681416,-0.5084808469,-0.203272596,-0.446426183,-3.015343428,-0.6969528198,1.848058105,0.09636003524,0.1608557403,0.02480675653 42 | 0.4008873999,0.3234584332,-0.4193536341,-1.70570612,-2.446777821,-0.05107681826,0.71277982,0.08519287407,0.3711932003,0.2970153093,-0.8593831658,-1.396288991,-0.4845659733,-0.1934860945,-0.4973852336,-3.089286089,-0.7334092855,1.833696604,0.09459446371,0.1615519226,0.02370432205 43 | 0.4022185504,0.3292583227,-0.4147127569,-1.719133973,-2.46859479,-0.05864816159,0.7097566128,0.08753298223,0.373865664,0.2891843021,-0.8227740526,-1.329158068,-0.4617771506,-0.1841711998,-0.5439296365,-3.149354696,-0.7655391097,1.816161156,0.09272176772,0.1617219299,0.02267135307 44 | 0.4035862386,0.3350999653,-0.410194248,-1.731989861,-2.489362955,-0.0658634305,0.7068789601,0.08954655379,0.3751199841,0.2814477384,-0.7876508236,-1.265249133,-0.4400544763,-0.1753051877,-0.5860841274,-3.196105003,-0.7934034467,1.795710206,0.09073790163,0.1613986343,0.02169793844 45 | 0.4049853981,0.3409612179,-0.4057966173,-1.744296908,-2.509132385,-0.07273928076,0.7041398287,0.09123361856,0.3750574291,0.2737919688,-0.7539542317,-1.204413652,-0.4193477631,-0.1668665409,-0.6239601374,-3.230334282,-0.8171652555,1.77268827,0.08867237717,0.160630241,0.02078399248 46 | 0.4064109325,0.3468214869,-0.4015186131,-1.756077409,-2.527951241,-0.07929158956,0.7015325427,0.09259895235,0.3737780154,0.2662054598,-0.7216275334,-1.14649725,-0.3996084034,-0.1588333845,-0.6576936841,-3.252872944,-0.8369845152,1.747371078,0.08651776612,0.1594576091,0.01991752908 47 | 
0.4078578055,0.3526617587,-0.3973591626,-1.767352819,-2.545865297,-0.08553547412,0.6990507841,0.09364801645,0.3713790774,0.2586829066,-0.6906166673,-1.091372728,-0.3807921708,-0.1511891037,-0.6873921156,-3.264338493,-0.8530156016,1.7200526,0.08432210237,0.1579260975,0.0191057194 48 | 0.4093210697,0.3584645689,-0.3933172524,-1.778143764,-2.562917948,-0.09148535132,0.6966884732,0.09439046681,0.3679571152,0.2512199581,-0.6608693004,-1.03888905,-0.3628526032,-0.1439122111,-0.7132634521,-3.265720606,-0.8655019999,1.691038251,0.08206364512,0.1560740918,0.01833258756 49 | 0.410795927,0.3642138839,-0.3893919289,-1.788469791,-2.579150677,-0.09715492278,0.6944398284,0.09483641386,0.3636045456,0.2438155264,-0.6323341131,-0.9889353514,-0.3457535505,-0.1369870156,-0.7354079485,-3.257618666,-0.8745895028,1.660505891,0.07978202403,0.1539425999,0.01760373265 50 | 0.4122777581,0.3698952198,-0.385582298,-1.798349977,-2.594602823,-0.1025573239,0.6922994256,0.09499890357,0.3584116995,0.2364700139,-0.6049630642,-0.9413807392,-0.3294505775,-0.1303950399,-0.7540143132,-3.240794659,-0.8804967999,1.628680706,0.07749184966,0.1515660137,0.01691614091 51 | 0.4137621224,0.375495404,-0.3818874657,-1.807802558,-2.609311819,-0.1077049896,0.6902620196,0.09489215165,0.3524655104,0.2291852981,-0.5787097216,-0.8960977793,-0.3139026165,-0.1241199672,-0.7692053914,-3.215848207,-0.8833800554,1.595705152,0.07516720891,0.1489609182,0.01625573821 52 | 0.4152448177,0.3810026646,-0.3783064485,-1.81684494,-2.623313427,-0.1126097143,0.6883226633,0.09453035146,0.3458498716,0.2219657749,-0.5535284877,-0.8529978991,-0.2990829647,-0.1181475073,-0.7811929584,-3.183618069,-0.8834796548,1.561792135,0.07284149528,0.1461701095,0.01562676579 53 | 0.4167218506,0.3864065707,-0.374838233,-1.825493813,-2.636641502,-0.1172828823,0.6864765882,0.09392879158,0.3386443257,0.2148166001,-0.5293762684,-0.811976552,-0.284955889,-0.1124631241,-0.790168345,-3.144799471,-0.8810173273,1.527165771,0.07055652142,0.1432292461,0.0150366202 
54 | 0.4181894958,0.3916978836,-0.3714817166,-1.833765268,-2.649328709,-0.1217353195,0.6847193241,0.09310439974,0.3309248686,0.2077422142,-0.5062111616,-0.7729048133,-0.2714823484,-0.1070507988,-0.7962448001,-3.099918842,-0.8761274815,1.491796136,0.06823741645,0.1401369125,0.01445809659 55 | 0.4196442664,0.3968685865,-0.3682357371,-1.841674805,-2.661405325,-0.1259772331,0.683046639,0.0920720771,0.3227631748,0.2007507682,-0.4839936495,-0.7357316017,-0.2586413324,-0.1019009575,-0.7996664047,-3.049725294,-0.8690789938,1.456003904,0.06598052382,0.1369453669,0.01391632762 56 | 0.4210828841,0.4019117653,-0.3650990129,-1.849237204,-2.672901154,-0.1300185025,0.6814544201,0.09084951133,0.3142274022,0.1938471049,-0.4626848698,-0.7003312707,-0.2463947833,-0.09699727595,-0.8005658984,-2.994710684,-0.8600105643,1.41974771,0.06372748315,0.1336484998,0.01338967122 57 | 0.4225023985,0.4068215787,-0.3620701432,-1.856466651,-2.683843851,-0.1338684261,0.6799388528,0.08945222944,0.3053812683,0.1870390773,-0.4422482848,-0.6666388512,-0.234723106,-0.09233058244,-0.7991551161,-2.935563326,-0.849153161,1.383213282,0.06151872501,0.1302839965,0.01288756449 58 | 0.4239000976,0.411593169,-0.3591476679,-1.863376737,-2.69426012,-0.1375359744,0.678496182,0.08789675683,0.296284169,0.180333063,-0.4226492345,-0.634562254,-0.2235943377,-0.08788838983,-0.7955752611,-2.872694731,-0.8366603851,1.346513033,0.05934936553,0.1268644035,0.01240567211 59 | 0.425273478,0.4162226021,-0.3563299775,-1.869980574,-2.704175234,-0.141029641,0.6771229506,0.08619936556,0.2869923413,0.1737352312,-0.403853178,-0.6040218472,-0.2129831165,-0.08365878463,-0.7899918556,-2.806655169,-0.8226977587,1.309664249,0.05720510706,0.1233965158,0.01193705201 60 | 
0.4266203344,0.4207068682,-0.3536153734,-1.876290798,-2.713613033,-0.1443575025,0.6758157611,0.08437518775,0.2775573134,0.1672526747,-0.3858281374,-0.5749567747,-0.2028692812,-0.07963341475,-0.7825935483,-2.7379601,-0.8074572682,1.272821307,0.0551183708,0.1199057177,0.01148956735 61 | 0.4279386997,0.4250437021,-0.3510020375,-1.882319331,-2.722596645,-0.1475273371,0.6745715141,0.08243972808,0.268026948,0.1608909667,-0.368543148,-0.5472840667,-0.1932267696,-0.07580138743,-0.7735576034,-2.667135,-0.7911096215,1.236053228,0.05307078734,0.116399467,0.01105566602 62 | 0.4292268157,0.4292316139,-0.3484881222,-1.888077855,-2.731148005,-0.150546506,0.6733871102,0.08040711284,0.2584446073,0.1546560228,-0.3519683182,-0.5209433436,-0.1840365678,-0.07215411961,-0.7630058527,-2.594482422,-0.7737811208,1.199434876,0.0510825254,0.112896204,0.01064046752 63 | 0.4304831624,0.4332697988,-0.3460716307,-1.893577337,-2.739287853,-0.1534220725,0.6722596884,0.07829188555,0.2488508672,0.148552537,-0.3360753357,-0.4958604872,-0.1752726436,-0.06868086755,-0.7510743141,-2.520379066,-0.7555931807,1.162952781,0.04912081361,0.1093886942,0.0102323601 64 | 0.4317064583,0.4371581078,-0.3437505066,-1.898828506,-2.747035742,-0.1561607122,0.6711865664,0.07610679418,0.2392823547,0.1425861269,-0.3208371997,-0.4719943404,-0.1669227779,-0.0653757453,-0.7379370332,-2.445242167,-0.7367148399,1.126767397,0.04723277315,0.1059076786,0.009844734333 65 | 0.4328956306,0.4408968985,-0.3415226042,-1.903841615,-2.754410744,-0.1587688774,0.670165062,0.07386527956,0.2297725379,0.1367601305,-0.3062281609,-0.4492661357,-0.1589601487,-0.06222908944,-0.7236613631,-2.369223833,-0.7172117829,1.090846896,0.0453822799,0.1024434716,0.009466071613 66 | 
0.4340497851,0.4444870949,-0.3393857181,-1.908626437,-2.761430502,-0.1612526327,0.6691927314,0.07157991081,0.2203524262,0.1310779154,-0.2922231555,-0.4276319742,-0.1513710022,-0.05923350528,-0.7084555626,-2.292855978,-0.6972549558,1.055290937,0.04358210787,0.09901089966,0.009099178948 67 | 0.4351682067,0.4479300976,-0.3373376131,-1.913192391,-2.768112183,-0.1636178046,0.6682671905,0.0692614764,0.2110484838,0.1255432069,-0.2787985504,-0.407040894,-0.1441388428,-0.05638191476,-0.692502737,-2.216587543,-0.6770020127,1.020180821,0.04183861986,0.09562198818,0.008745257743 68 | 0.4362504184,0.4512277246,-0.3353759944,-1.917548656,-2.774472237,-0.1658699811,0.6673862338,0.06691975892,0.2018831968,0.1201589033,-0.2659319937,-0.3874396384,-0.1372458339,-0.05366767198,-0.6758520007,-2.140482187,-0.6565001607,0.9855268598,0.0401469171,0.09227785468,0.008402713574 69 | 0.4372960329,0.4543821514,-0.3334985077,-1.921703815,-2.780525923,-0.1680144519,0.6665476561,0.06456464529,0.1928775162,0.1149269342,-0.2536018491,-0.368780762,-0.1306763291,-0.05108390749,-0.6586300731,-2.064765692,-0.6358581185,0.9514898062,0.03850866482,0.08899307251,0.008071147837 70 | 0.4383048415,0.4573958516,-0.3317027688,-1.925666332,-2.786288023,-0.1700562686,0.6657494903,0.06220534444,0.1840503663,0.1098481715,-0.2417849749,-0.3510194421,-0.1244152784,-0.04862380028,-0.6409112215,-1.989667296,-0.6151273847,0.9178655148,0.03692013025,0.08575350791,0.007750111632 71 | 0.4392768145,0.4602716267,-0.3299863935,-1.929444194,-2.791772604,-0.1720002592,0.6649897695,0.05985035002,0.1754169315,0.1049239114,-0.2304640412,-0.3341118395,-0.1184485778,-0.0462824516,-0.6228089333,-1.915370822,-0.5944007635,0.8848097324,0.03538250923,0.08257258683,0.007439515088 72 | 
0.4402119815,0.4630125165,-0.3283469677,-1.933045149,-2.796993017,-0.1738510132,0.6642665863,0.05750740692,0.1669907272,0.1001545414,-0.2196200341,-0.3180170357,-0.1127626225,-0.04405361041,-0.6044149399,-1.842044592,-0.5737478733,0.8523536921,0.03389493749,0.0794538334,0.007138856687 73 | 0.4411105216,0.4656217396,-0.3267820477,-1.936476707,-2.801962137,-0.1756129265,0.6635782719,0.05518358201,0.1587831378,0.09553999454,-0.2092347294,-0.3026961982,-0.1073444933,-0.04193189368,-0.5858139992,-1.769841313,-0.5532319546,0.8205248713,0.03245694563,0.07640042156,0.006847959477 74 | 0.4419727623,0.4681027234,-0.3252892494,-1.939746022,-2.806691647,-0.1772901863,0.6629230976,0.05288530141,0.1508035362,0.09107973427,-0.1992906481,-0.2881121635,-0.102181673,-0.03991211951,-0.5666686893,-1.697759032,-0.5325611234,0.7889673114,0.0310611017,0.07338098437,0.006565776188 75 | 0.4427990913,0.4704590142,-0.3238661289,-1.942859888,-2.811193466,-0.1788867712,0.6622994542,0.05062165856,0.1430660188,0.08677367121,-0.1897743046,-0.2742264867,-0.09726103395,-0.03799046203,-0.5481389165,-1.628986716,-0.5126609206,0.7584556937,0.0297195632,0.07046501338,0.00629402278 76 | 0.443590045,0.4726944268,-0.3225103021,-1.9458251,-2.815478325,-0.180406481,0.6617058516,0.04839053005,0.1355638504,0.08262271434,-0.1806692034,-0.2610118687,-0.09257523715,-0.03616001457,-0.5293614864,-1.560881615,-0.4928801954,0.7286660075,0.02842600457,0.0676234439,0.006031180266 77 | 0.4443461597,0.4748126268,-0.3212193251,-1.948648095,-2.819556713,-0.1818529665,0.6611408591,0.04620121419,0.1283075362,0.07862105221,-0.1719565541,-0.2484323829,-0.08811034262,-0.03441727906,-0.5106551051,-1.494330406,-0.4734219313,0.6995820999,0.02717776224,0.06485396624,0.005776790436 78 | 
0.4450680614,0.4768174291,-0.3199908733,-1.951334953,-2.823438406,-0.1832296848,0.660603106,0.04405746982,0.1212999374,0.07476712763,-0.1636214405,-0.2364581972,-0.08385671675,-0.03275864571,-0.4920838475,-1.429409862,-0.4543305039,0.6712672114,0.0259762872,0.06216234341,0.005531410687 79 | 0.4457564652,0.4787127376,-0.3188226223,-1.953891516,-2.827133179,-0.1845399439,0.6600912809,0.04196260497,0.1145428047,0.07105877995,-0.1556486189,-0.2250598073,-0.07980436087,-0.03117946722,-0.473682791,-1.366177082,-0.4356205761,0.6436408162,0.02481738105,0.05954186246,0.005294112954 80 | 0.4464121163,0.4805024564,-0.3177123368,-1.956323504,-2.830649853,-0.185786888,0.6596040726,0.03991941363,0.1080363393,0.06749398261,-0.1480251402,-0.2142102271,-0.07594421506,-0.02967649326,-0.4555028975,-1.304680347,-0.4173257053,0.616758585,0.02370197698,0.0569970198,0.005065237172 81 | 0.4470358491,0.4821905196,-0.3166577518,-1.958636403,-2.833996773,-0.1869735122,0.6591403484,0.03793036193,0.1017799005,0.06407012045,-0.1407376528,-0.203882888,-0.07226729393,-0.02824584767,-0.4375836551,-1.24495542,-0.3994668722,0.590626955,0.022628773,0.0545280017,0.004844472278 82 | 0.4476284981,0.4837808311,-0.315656662,-1.960835457,-2.837182522,-0.1881026924,0.6586990356,0.03599748015,0.09577192366,0.06078436971,-0.1337732822,-0.1940526813,-0.06876513362,-0.02688404918,-0.4199605584,-1.187028885,-0.3820621967,0.5652502179,0.02159666456,0.05213497579,0.004631639458 83 | 0.4481909573,0.4852772653,-0.3147068918,-1.962925673,-2.840214491,-0.1891771406,0.6582790017,0.03412241861,0.09001000971,0.05763369426,-0.1271196008,-0.1846956909,-0.06542956084,-0.0255877506,-0.402664125,-1.130917907,-0.365125984,0.5406318307,0.02060463652,0.04981780052,0.004426551983 84 | 
0.4487241209,0.4866836667,-0.313806355,-1.964911938,-2.843100309,-0.1901994795,0.6578791738,0.03230649978,0.08449103683,0.05461483449,-0.1207645983,-0.175789088,-0.06225283816,-0.02435382456,-0.3857217133,-1.076632619,-0.3486687541,0.5167713761,0.01965146698,0.04757606983,0.004228991456 85 | 0.4492289126,0.48800385,-0.3129529953,-1.966798902,-2.84584713,-0.1911721826,0.6574986577,0.03055064008,0.07921122015,0.05172451213,-0.1146966517,-0.1673113108,-0.05922760442,-0.02317926101,-0.36915645,-1.024174333,-0.3326985538,0.4936674833,0.01873614267,0.04540929943,0.004038783256 86 | 0.4497062564,0.4892415106,-0.3121447861,-1.968591094,-2.84846139,-0.1920976192,0.6571364999,0.02885544114,0.07416622341,0.04895931482,-0.1089045405,-0.1592417359,-0.0563467741,-0.02206124365,-0.3529878557,-0.9735395908,-0.3172212839,0.4713168144,0.01785758696,0.04331680387,0.003855767194 87 | 0.4501571357,0.4904003441,-0.3113797903,-1.970292687,-2.850949526,-0.1929780394,0.6567918062,0.02722124569,0.06935122609,0.04631572217,-0.1033774242,-0.1515607387,-0.05360361934,-0.02099700831,-0.3372320533,-0.924716413,-0.3022401631,0.449714154,0.01701463945,0.04129773006,0.003679719288 88 | 0.4505824745,0.4914839566,-0.3106561005,-1.971907973,-2.853317738,-0.193815589,0.6564637423,0.02564814687,0.06476099044,0.04379013553,-0.0981048122,-0.1442496777,-0.05099172145,-0.01998400688,-0.3219030499,-0.8776893616,-0.2877560258,0.4288532734,0.01620633528,0.03935112432,0.003510491457 89 | 0.4509832263,0.4924958348,-0.309971869,-1.973440886,-2.855571747,-0.1946123391,0.6561514735,0.02413593791,0.06038993597,0.04137897864,-0.09307654947,-0.1372907311,-0.04850495607,-0.01901976578,-0.3070108294,-0.8324378133,-0.2737682164,0.4087259471,0.01543151122,0.03747589886,0.003347876016 90 | 
0.4513603449,0.493439436,-0.3093253076,-1.974895239,-2.857716799,-0.1953702271,0.6558542848,0.02268424816,0.05623219535,0.03907858953,-0.08828282356,-0.1306670308,-0.0461374484,-0.01810194924,-0.292563647,-0.7889370322,-0.2602742016,0.3893229365,0.01468919124,0.03567082062,0.003191707423 91 | 0.4517147839,0.494318068,-0.3087147176,-1.97627461,-2.859758377,-0.1960911304,0.6555714607,0.02129248343,0.05228166655,0.03688533232,-0.08371414244,-0.1243624166,-0.04388364032,-0.017228337,-0.2785670459,-0.7471584678,-0.2472697049,0.3706340194,0.01397831831,0.03393467516,0.003041805001 92 | 0.4520474672,0.4951349795,-0.3081383705,-1.977582693,-2.861701488,-0.1967768073,0.6553022861,0.01995986328,0.048532065,0.0347955972,-0.07936131954,-0.1183615476,-0.04173816368,-0.01639678143,-0.2650242448,-0.7070704699,-0.2347497791,0.3526471257,0.01329779997,0.03226603195,0.002897963859 93 | 0.4523593485,0.4958932996,-0.3075946867,-1.978822708,-2.863550901,-0.1974289715,0.6550461054,0.01868549362,0.04497697577,0.03280573338,-0.07521548122,-0.1126498356,-0.03969594464,-0.01560529787,-0.2526293695,-0.6706873178,-0.2232796699,0.3356391191,0.01265542675,0.03069613315,0.002762634773 94 | 0.4526513219,0.4965960681,-0.3070820868,-1.979997993,-2.865311146,-0.1980492175,0.6548022628,0.01746317744,0.0415959619,0.03091059253,-0.07127005607,-0.1072124243,-0.03775122017,-0.01485297363,-0.2399577945,-0.6337639093,-0.2116873711,0.3190208673,0.01203166042,0.02915699966,0.002630427247 95 | 0.452924192,0.4972459972,-0.3065991104,-1.981111646,-2.866986275,-0.1986390799,0.6545701623,0.01629783213,0.03839724511,0.02910715714,-0.06751421094,-0.1020379066,-0.03590139747,-0.01413557492,-0.2277424783,-0.5984229445,-0.20055224,0.3030598462,0.01143565774,0.0276815258,0.002503463766 96 | 
0.4531788528,0.4978459477,-0.3061442971,-1.982166529,-2.86858058,-0.1992000341,0.6543492675,0.01518779434,0.03537417203,0.02739254758,-0.06394010782,-0.09711292386,-0.03414084017,-0.013453044,-0.2159790248,-0.5646249652,-0.1898676902,0.2877433896,0.01086638402,0.02626715787,0.002382005565 97 | 0.4534161687,0.4983986616,-0.3057162762,-1.983165622,-2.870097876,-0.1997334808,0.6541390419,0.01413150225,0.0325201191,0.02576362714,-0.06054004282,-0.09242533892,-0.03246531636,-0.01280340832,-0.2046622187,-0.5323289633,-0.1796251386,0.2730561495,0.01032263227,0.02491227351,0.002265748335 98 | 0.4536369741,0.4989067912,-0.3053137064,-1.984111547,-2.871541977,-0.2002407461,0.6539390087,0.01312741823,0.02982849628,0.02421717159,-0.05730658025,-0.08796370775,-0.03087086044,-0.01218507718,-0.1937855333,-0.5014916062,-0.1698149443,0.2589823008,0.009803353809,0.02361527458,0.002154496498 99 | 0.4538421035,0.4993728697,-0.3049353063,-1.985006928,-2.87291646,-0.2007230967,0.6537486315,0.01217402518,0.02729279175,0.02274991758,-0.05423253775,-0.08371718228,-0.0293536596,-0.01159654465,-0.1833422482,-0.4720694125,-0.1604266167,0.2455061525,0.009307648055,0.02237456664,0.002048079856 100 | 0.4540323317,0.4997993112,-0.3045798242,-1.985854268,-2.874224424,-0.2011817545,0.6535674334,0.01126977615,0.02490657009,0.021358639,-0.0513109751,-0.07967542857,-0.02791005187,-0.01103638578,-0.1733246595,-0.444018811,-0.1514490694,0.2326106429,0.00883462932,0.02118841745,0.001946338336 101 | 0.4542084336,0.5001884699,-0.3042460978,-1.986655951,-2.875469446,-0.2016178519,0.6533949971,0.01041310374,0.02266349085,0.02004017495,-0.04853520542,-0.0758285895,-0.02653654106,-0.01050323714,-0.1637242883,-0.417295754,-0.1428709626,0.2202794403,0.008383398876,0.02005516179,0.001849099761 102 | 
0.4543711245,0.5005425811,-0.3039329648,-1.98741436,-2.876654387,-0.2020324916,0.6532309055,0.009602421895,0.02055733092,0.01879142225,-0.0458987765,-0.07216730714,-0.02522980422,-0.009995797649,-0.154532209,-0.3918561041,-0.1346806437,0.2084955275,0.007953152061,0.01897310466,0.001756212441 103 | 0.4545211494,0.5008637905,-0.3036393523,-1.988131523,-2.877782106,-0.2024267018,0.6530747414,0.008836132474,0.01858198829,0.01760936715,-0.04339547083,-0.06868264079,-0.02398666367,-0.009512835182,-0.1457388699,-0.3676556051,-0.1268662065,0.1972418576,0.00754306186,0.01794055663,0.001667518751 104 | 0.4546592236,0.5011541247,-0.3033642173,-1.988809586,-2.878855228,-0.2028014958,0.6529260874,0.008112620562,0.01673150063,0.01649108902,-0.04101929441,-0.06536606699,-0.02280407585,-0.009053166024,-0.1373342127,-0.3446505964,-0.119415924,0.1865013987,0.007152289618,0.01695582084,0.00158285175 105 | 0.4547859728,0.5014155507,-0.3031065464,-1.989450455,-2.879876614,-0.2031578124,0.6527846456,0.007430285681,0.01500004902,0.0154337259,-0.03876447678,-0.06220950559,-0.02167916298,-0.008615679108,-0.1293216944,-0.3228315115,-0.1123270616,0.1762599647,0.006778756622,0.01601735502,0.001501632622 106 | 0.4549020827,0.5016499162,-0.302865386,-1.990056157,-2.880848646,-0.2034965456,0.6526499987,0.006787302904,0.01338172052,0.01443466451,-0.03662548214,-0.05920651928,-0.02060947567,-0.008199578151,-0.1216609851,-0.3020819724,-0.1055682003,0.1664951593,0.006425974425,0.01512358617,0.001425127848 107 | 0.4550081491,0.5018590093,-0.302639842,-1.990628481,-2.88177371,-0.2038185745,0.6525219083,0.006182392593,0.01187130623,0.01349099912,-0.03459697962,-0.05634705722,-0.01959168166,-0.007803237066,-0.1143573448,-0.282399267,-0.09913746268,0.1571926177,0.006088545546,0.01427233033,0.001351634157 108 | 
0.4551047385,0.5020444989,-0.3024290502,-1.991169095,-2.88265419,-0.2041246891,0.6523999572,0.005613923538,0.01046345942,0.01260021795,-0.03267377988,-0.05362561345,-0.01862370037,-0.007426050026,-0.1073997021,-0.2637417912,-0.09302335978,0.1483357847,0.005767463706,0.01346235909,0.001281589153 109 | 0.4551924467,0.5022079945,-0.3022321761,-1.991679668,-2.883491993,-0.204415679,0.6522839069,0.005080338102,0.009153005667,0.01175983716,-0.0308509469,-0.05103553832,-0.0177031178,-0.0070670899,-0.1007769257,-0.2460685223,-0.08721423149,0.1399082392,0.005462038796,0.0126920836,0.001214856165 110 | 0.4552718401,0.502350986,-0.3020484149,-1.992161751,-2.884289503,-0.2046922892,0.6521734595,0.00458010193,0.007934940048,0.01096746232,-0.02912373841,-0.04857050255,-0.01682764664,-0.006725465879,-0.09447791427,-0.2293392569,-0.08169858158,0.1318937093,0.005171630532,0.01195993926,0.001151309232 111 | 0.4553433955,0.5024749637,-0.3018770516,-1.992616773,-2.885048389,-0.2049552202,0.6520683765,0.004111705814,0.006804431789,0.01022079308,-0.02748760395,-0.04622444883,-0.01599510945,-0.006400333717,-0.08849134296,-0.2135147899,-0.07646526396,0.1242762804,0.004895556718,0.01126441732,0.001090806327 112 | 0.4554076493,0.5025812984,-0.3017173409,-1.993046284,-2.885770559,-0.2052051425,0.6519683599,0.003673688974,0.00575682288,0.009517588653,-0.02593817562,-0.04399163648,-0.01520343963,-0.006090905983,-0.08280608058,-0.1985568255,-0.07150306553,0.1170402542,0.004633172881,0.01060403511,0.001033217646 113 | 0.4554650486,0.5026712418,-0.3015686274,-1.993451595,-2.88645792,-0.2054426968,0.6518731713,0.00326461927,0.004787627608,0.008855705149,-0.0244712662,-0.04186661541,-0.01445065998,-0.005796423648,-0.07741107047,-0.1844279468,-0.06680120528,0.1101703122,0.004383870866,0.009977333248,0.0009784203721 114 | 
0.4555160701,0.5027460456,-0.3014302552,-1.993833899,-2.887112141,-0.2056684941,0.6517826319,0.00288310647,0.003892531618,0.008233074099,-0.02308285609,-0.03984420002,-0.01373489946,-0.005516166799,-0.07229542732,-0.1710918993,-0.06234893203,0.1036514193,0.004147099797,0.009382919408,0.0009263054817 115 | 0.4555611312,0.5028068423,-0.3013015985,-1.994194627,-2.887734652,-0.2058831006,0.6516964436,0.002527792007,0.003067390062,0.007647718769,-0.02176910266,-0.03791942447,-0.01305435225,-0.005249446724,-0.067448318,-0.1585132927,-0.05813586712,0.09746901691,0.003922280855,0.008819400333,0.0008767513791 116 | 0.4556006193,0.5028547645,-0.3011820912,-1.994534731,-2.888327122,-0.2060870677,0.6516144276,0.002197360387,0.002308227122,0.007097736932,-0.02052631974,-0.03608758003,-0.01240732521,-0.004995612893,-0.06285914034,-0.1466578543,-0.05415191501,0.09160877019,0.003708850592,0.008285460062,0.0008296433953 117 | 0.4556349516,0.5028908253,-0.3010711968,-1.994855404,-2.888890982,-0.2062809318,0.6515363455,0.001890536048,0.001611231826,0.006581305992,-0.01935098134,-0.03434419632,-0.01179218199,-0.004754042253,-0.05851742253,-0.1354924291,-0.05038718507,0.08605674654,0.003506304231,0.007779780775,0.000784878328 118 | 0.4556644857,0.5029159784,-0.3009683788,-1.995157719,-2.889427662,-0.2064651847,0.6514620781,0.001606088364,0.0009727548459,0.006096674595,-0.01823971607,-0.0326850079,-0.01120738126,-0.004524147604,-0.05459246039,-0.1254370809,-0.04698073491,0.08092487603,0.003315753071,0.007312920876,0.0007426269003 119 | -------------------------------------------------------------------------------- /include/common/dz.cuh: -------------------------------------------------------------------------------- 1 | #include "utils/matrix.cuh" 2 | 3 | template 4 | __global__ 5 | void compute_dz_kernel(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G_dense, T *d_C_dense, T *d_g_val, T *d_lambda, T *d_dz){ 6 | 7 | extern __shared__ T s_mem[]; 8 | 9 | const 
uint32_t states_sq = state_size*state_size; 10 | const uint32_t states_p_controls = state_size * control_size; 11 | const uint32_t controls_sq = control_size * control_size; 12 | const uint32_t states_s_controls = state_size + control_size; 13 | unsigned set; 14 | 15 | for(int blockrow = blockIdx.x; blockrow < 2*knot_points-1; blockrow+=gridDim.x){ 16 | 17 | set = blockrow/2; 18 | 19 | if(blockrow%2){ // control row 20 | // shared mem config 21 | // Rkinv | BkT 22 | // C^2 | S*C 23 | 24 | T *s_Rk_i = s_mem; 25 | T *s_BkT = s_Rk_i + controls_sq; 26 | T *s_scratch = s_BkT + states_p_controls; 27 | 28 | // load Rkinv from G 29 | glass::copy(controls_sq, d_G_dense+set*(states_sq+controls_sq)+states_sq, s_Rk_i); 30 | 31 | // load Bk from C 32 | glass::copy(states_p_controls, d_C_dense+set*(states_sq+states_p_controls)+states_sq, s_BkT); 33 | 34 | __syncthreads(); 35 | 36 | // // compute BkT*lkp1 37 | gato_ATx(s_scratch, 38 | s_BkT, 39 | d_lambda+(set+1)*state_size, 40 | state_size, 41 | control_size); 42 | __syncthreads(); 43 | 44 | // subtract from rk 45 | gato_vec_dif(s_scratch, 46 | d_g_val+set*(states_s_controls)+state_size, 47 | s_scratch, 48 | control_size); 49 | __syncthreads(); 50 | 51 | // multiply Rk_i*scratch in scratch + C 52 | mat_vec_prod( control_size, control_size,s_Rk_i, 53 | s_scratch, 54 | s_scratch+control_size); 55 | __syncthreads(); 56 | 57 | // store in d_dz 58 | glass::copy(control_size, s_scratch+control_size, d_dz+set*(states_s_controls)+state_size); 59 | 60 | } 61 | else{ // state row 62 | 63 | T *s_Qk_i = s_mem; 64 | T *s_AkT = s_Qk_i + states_sq; 65 | T *s_scratch = s_AkT + states_sq; 66 | 67 | // shared mem config 68 | // Qkinv | AkT | scratch 69 | // S^2 S^2 70 | 71 | /// TODO: error check 72 | // load Qkinv from G 73 | glass::copy(states_sq, d_G_dense+set*(states_sq+controls_sq), s_Qk_i); 74 | 75 | ///TODO: linsys solver hasn't been checked with this change 76 | if(set != knot_points-1){ 77 | // load Ak from C 78 | glass::copy(states_sq, 
d_C_dense+set*(states_sq+states_p_controls), s_AkT); 79 | __syncthreads(); 80 | 81 | // // compute AkT*lkp1 in scratch 82 | gato_ATx(s_scratch, 83 | s_AkT, 84 | d_lambda+(set+1)*state_size, 85 | state_size, 86 | state_size); 87 | __syncthreads(); 88 | } 89 | else{ 90 | for(int i = threadIdx.x; i < state_size; i+=blockDim.x){ 91 | s_scratch[i] = 0; 92 | } 93 | } 94 | 95 | 96 | // add lk to scratch 97 | gato_vec_sum(s_scratch, // out 98 | d_lambda+set*state_size, 99 | s_scratch, 100 | state_size); 101 | __syncthreads(); 102 | 103 | // subtract from qk in scratch 104 | gato_vec_dif(s_scratch, 105 | d_g_val+set*(states_s_controls), 106 | s_scratch, 107 | state_size); 108 | __syncthreads(); 109 | 110 | 111 | // multiply Qk_i(scratch) in Akt 112 | mat_vec_prod( state_size, state_size,s_Qk_i, 113 | s_scratch, 114 | s_AkT); 115 | __syncthreads(); 116 | 117 | // store in dz 118 | glass::copy(state_size, s_AkT, d_dz+set*(states_s_controls)); 119 | } __syncthreads(); // both branches finish by copying shared memory out to d_dz; hold the block here so the next blockrow's loads into s_mem cannot overwrite that data early (assumes glass::copy has no trailing barrier - TODO confirm). blockrow is the same for every thread of a block, so the barrier is reached uniformly 120 | } 121 | } 122 | 123 | 124 | template 125 | void compute_dz(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G_dense, T *d_C_dense, T *d_g_val, T *d_lambda, T *d_dz){ 126 | 127 | compute_dz_kernel<<>>( 128 | state_size, 129 | control_size, 130 | knot_points, 131 | d_G_dense, 132 | d_C_dense, 133 | d_g_val, 134 | d_lambda, 135 | d_dz 136 | ); 137 | } -------------------------------------------------------------------------------- /include/common/integrator.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | namespace cgrps = cooperative_groups; 7 | #include "dynamics/rbd_plant.cuh" 8 | 9 | #include "glass.cuh" 10 | 11 | // angleWrap: brings an angle lying at most one revolution outside [-pi, pi] back into that range by shifting it a full turn (2*pi), so the result is the same physical angle 12 | template 13 | __host__ __device__ 14 | T angleWrap(T input){ 15 | const T pi = static_cast(3.14159265358979323846); 16 | if(input > pi){input = input - 2*pi;} 17 | if(input < -pi){input = input + 2*pi;} 18 | return input; 19 | } 20 | 21 | 22 | template 23 | __device__ 24 | void
exec_integrator_error(uint32_t state_size, T *s_err, T *s_qkp1, T *s_qdkp1, T *s_q, T *s_qd, T *s_qdd, T dt, cgrps::thread_block block, bool absval = false){ // integrates (s_q, s_qd, s_qdd) one step of dt and writes the difference to the stored next state (s_qkp1, s_qdkp1) into s_err[0..state_size): position part first, velocity part at offset state_size/2; absolute values when absval is set 25 | T new_qkp1; T new_qdkp1; 26 | for (unsigned ind = threadIdx.x; ind < state_size/2; ind += blockDim.x){ 27 | // euler xk = xk + dt *dxk 28 | if (INTEGRATOR_TYPE == 0){ 29 | new_qkp1 = s_q[ind] + dt*s_qd[ind]; 30 | new_qdkp1 = s_qd[ind] + dt*s_qdd[ind]; 31 | } 32 | // semi-implicit euler 33 | // qdkp1 = qdk + dt*qddk 34 | // qkp1 = qk + dt*qdkp1 35 | else if (INTEGRATOR_TYPE == 1){ 36 | new_qdkp1 = s_qd[ind] + dt*s_qdd[ind]; 37 | new_qkp1 = s_q[ind] + dt*new_qdkp1; 38 | } 39 | else {printf("Integrator [%d] not defined. Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);} 40 | 41 | // wrap angles if needed (only the position part is wrapped) 42 | if(ANGLE_WRAP){ 43 | new_qkp1 = angleWrap(new_qkp1); 44 | } 45 | 46 | // then compute error 47 | if(absval){ 48 | s_err[ind] = abs(s_qkp1[ind] - new_qkp1); 49 | s_err[ind + state_size/2] = abs(s_qdkp1[ind] - new_qdkp1); 50 | } 51 | else{ 52 | s_err[ind] = s_qkp1[ind] - new_qkp1; 53 | s_err[ind + state_size/2] = s_qdkp1[ind] - new_qdkp1; 54 | } 55 | // printf("err[%f] with new qkp1[%f] vs orig[%f] and new qdkp1[%f] vs orig[%f] with qk[%f] qdk[%f] qddk[%f] and dt[%f]\n",s_err[ind],new_qkp1,s_qkp1[ind],new_qdkp1,s_qdkp1[ind],s_q[ind],s_qd[ind],s_qdd[ind],dt); 56 | } 57 | } 58 | 59 | template 60 | __device__ 61 | void exec_integrator_gradient(uint32_t state_size, uint32_t control_size, T *s_Ak, T *s_Bk, T *s_dqdd, T dt, cgrps::thread_block block){ 62 | 63 | const uint32_t thread_id = threadIdx.x; 64 | const uint32_t block_dim = blockDim.x; 65 | 66 | // and finally A and B 67 | if (INTEGRATOR_TYPE == 0){ 68 | // then apply the euler rule -- xkp1 = xk + dt*dxk thus AB = [I_{state},0_{control}] + dt*dxd 69 | // where dxd = [ 0, I, 0; dqdd/dq, dqdd/dqd, dqdd/du] 70 | for (unsigned ind = thread_id; ind < state_size*(state_size + control_size); ind += block_dim){
71 | int c = ind / state_size; int r = ind % state_size; 72 | T *dst = (c < state_size)? &s_Ak[ind] : &s_Bk[ind - state_size*state_size]; // dst 73 | T val = (r == c) * static_cast(1); // first term (non-branching) 74 | val += (r < state_size/2 && r == c - state_size/2) * dt; // first dxd term (non-branching) 75 | if(r >= state_size/2) { val += dt * s_dqdd[c*state_size/2 + r - state_size/2]; } 76 | ///TODO: EMRE why didn't this error before? 77 | // val += (r >= state_size/2) * dt * s_dqdd[c*state_size/2 + r - state_size/2]; // second dxd term (non-branching) 78 | *dst = val; 79 | } 80 | } 81 | else if (INTEGRATOR_TYPE == 1){ 82 | // semi-inplicit euler 83 | // qdkp1 = qdk + dt*qddk 84 | // qkp1 = qk + dt*qdkp1 = qk + dt*qdk + dt^2*qddk 85 | // dxkp1 = [Ix | 0u ] + dt*[[0q, Iqd, 0u] + dt*dqdd 86 | // dqdd] 87 | // Ak = I + dt * [[0,I] + dt*dqdd/dx; dqdd/dx] 88 | // Bk = [dt*dqdd/du; dqdd/du] 89 | for (unsigned ind = thread_id; ind < state_size*state_size; ind += block_dim){ 90 | int c = ind / state_size; int r = ind % state_size; int rdqdd = r % (state_size/2); 91 | T dtVal = static_cast((r == rdqdd)*dt + (r != rdqdd)); 92 | s_Ak[ind] = static_cast((r == c) + dt*(r == c - state_size/2)) + 93 | dt * s_dqdd[c*state_size/2 + rdqdd] * dtVal; 94 | if(c < control_size){ 95 | s_Bk[ind] = dt * s_dqdd[state_size*state_size/2 + c*state_size/2 + rdqdd] * dtVal; 96 | } 97 | } 98 | } 99 | else{printf("Integrator [%d] not defined. 
Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);} 100 | } 101 | 102 | 103 | template 104 | __device__ 105 | void exec_integrator(uint32_t state_size, T *s_qkp1, T *s_qdkp1, T *s_q, T *s_qd, T *s_qdd, T dt, cgrps::thread_block block){ 106 | 107 | const uint32_t thread_id = threadIdx.x; 108 | const uint32_t block_dim = blockDim.x; 109 | 110 | for (unsigned ind = thread_id; ind < state_size/2; ind += block_dim){ 111 | // euler xk = xk + dt *dxk 112 | if (INTEGRATOR_TYPE == 0){ 113 | s_qkp1[ind] = s_q[ind] + dt*s_qd[ind]; 114 | s_qdkp1[ind] = s_qd[ind] + dt*s_qdd[ind]; 115 | } 116 | // semi-inplicit euler 117 | // qdkp1 = qdk + dt*qddk 118 | // qkp1 = qk + dt*qdkp1 119 | else if (INTEGRATOR_TYPE == 1){ 120 | s_qdkp1[ind] = s_qd[ind] + dt*s_qdd[ind]; 121 | s_qkp1[ind] = s_q[ind] + dt*s_qdkp1[ind]; 122 | } 123 | else{printf("Integrator [%d] not defined. Currently support [0: Euler and 1: Semi-Implicit Euler]",INTEGRATOR_TYPE);} 124 | 125 | // wrap angles if needed 126 | if(ANGLE_WRAP){ 127 | s_qkp1[ind] = angleWrap(s_qkp1[ind]); 128 | } 129 | } 130 | } 131 | 132 | // s_temp of size state_size/2*(state_size + control_size + 1) + DYNAMICS_TEMP 133 | template 134 | __device__ __forceinline__ 135 | void integratorAndGradient(uint32_t state_size, uint32_t control_size, T *s_xux, T *s_Ak, T *s_Bk, T *s_xnew_err, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){ 136 | 137 | 138 | // first compute qdd and dqdd 139 | T *s_qdd = s_temp; 140 | T *s_dqdd = s_qdd + state_size/2; 141 | T *s_extra_temp = s_dqdd + state_size/2*(state_size+control_size); 142 | T *s_q = s_xux; 143 | T *s_qd = s_q + state_size/2; 144 | T *s_u = s_qd + state_size/2; 145 | gato_plant::forwardDynamicsAndGradient(s_dqdd, s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const); 146 | block.sync(); 147 | // first compute xnew or error 148 | if (COMPUTE_INTEGRATOR_ERROR){ 149 | exec_integrator_error(state_size, s_xnew_err, &s_xux[state_size+control_size], 
&s_xux[state_size+control_size+state_size/2], s_q, s_qd, s_qdd, dt, block); 150 | } 151 | else{ 152 | exec_integrator(state_size, s_xnew_err, &s_xnew_err[state_size/2], s_q, s_qd, s_qdd, dt, block); 153 | } 154 | 155 | // then compute gradient 156 | exec_integrator_gradient(state_size, control_size, s_Ak, s_Bk, s_dqdd, dt, block); 157 | } 158 | 159 | 160 | // s_temp of size 3*state_size/2 + DYNAMICS_TEMP 161 | template 162 | __device__ 163 | T integratorError(uint32_t state_size, T *s_xuk, T *s_xkp1, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){ 164 | 165 | // first compute qdd 166 | T *s_q = s_xuk; 167 | T *s_qd = s_q + state_size/2; 168 | T *s_u = s_qd + state_size/2; 169 | T *s_qkp1 = s_xkp1; 170 | T *s_qdkp1 = s_qkp1 + state_size/2; 171 | T *s_qdd = s_temp; 172 | T *s_err = s_qdd + state_size/2; 173 | T *s_extra_temp = s_err + state_size/2; 174 | gato_plant::forwardDynamics(s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const, block); 175 | block.sync(); 176 | // if(blockIdx.x == 0 && threadIdx.x==0){ 177 | // printf("\n"); 178 | // for(int i = 0; i < state_size/2; i++){ 179 | // printf("%f ", s_qdd[i]); 180 | // } 181 | // printf("\n"); 182 | // } 183 | // block.sync(); 184 | // then apply the integrator and compute error 185 | exec_integrator_error(state_size, s_err, s_qkp1, s_qdkp1, s_q, s_qd, s_qdd, dt, block, true); 186 | block.sync(); 187 | 188 | // finish off forming the error 189 | glass::reduce(state_size, s_err); 190 | block.sync(); 191 | // if(GATO_LEAD_THREAD){printf("in integratorError with reduced error of [%f]\n",s_err[0]);} 192 | return s_err[0]; 193 | } 194 | 195 | 196 | 197 | template 198 | __device__ 199 | void integrator(uint32_t state_size, T *s_xkp1, T *s_xuk, T *s_temp, void *d_dynMem_const, T dt, cgrps::thread_block block){ 200 | // first compute qdd 201 | T *s_q = s_xuk; T *s_qd = s_q + state_size/2; T *s_u = s_qd + state_size/2; 202 | T *s_qkp1 = s_xkp1; T *s_qdkp1 = s_qkp1 + state_size/2; 203 | T *s_qdd = s_temp; T 
*s_extra_temp = s_qdd + state_size/2; 204 | gato_plant::forwardDynamics(s_qdd, s_q, s_qd, s_u, s_extra_temp, d_dynMem_const, block); 205 | block.sync(); 206 | exec_integrator(state_size, s_qkp1, s_qdkp1, s_q, s_qd, s_qdd, dt, block); 207 | } 208 | 209 | 210 | 211 | 212 | template 213 | __global__ 214 | void integrator_kernel(uint32_t state_size, uint32_t control_size, T *d_xkp1, T *d_xuk, void *d_dynMem_const, T dt){ 215 | extern __shared__ T s_smem[]; 216 | T *s_xkp1 = s_smem; 217 | T *s_xuk = s_xkp1 + state_size; 218 | T *s_temp = s_xuk + state_size + control_size; 219 | cgrps::thread_block block = cgrps::this_thread_block(); 220 | cgrps::grid_group grid = cgrps::this_grid(); 221 | for (unsigned ind = threadIdx.x; ind < state_size + control_size; ind += blockDim.x){ 222 | s_xuk[ind] = d_xuk[ind]; 223 | } 224 | 225 | block.sync(); 226 | integrator(state_size, s_xkp1, s_xuk, s_temp, d_dynMem_const, dt, block); 227 | block.sync(); 228 | 229 | for (unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 230 | d_xkp1[ind] = s_xkp1[ind]; 231 | } 232 | } 233 | 234 | // We take start state from h_xs, and control input from h_xu, and update h_xs 235 | template 236 | void integrator_host(uint32_t state_size, uint32_t control_size, T *d_xs, T *d_xu, void *d_dynMem_const, T dt){ 237 | // T *d_xu; 238 | // T *d_xs_new; 239 | // gpuErrchk(cudaMalloc(&d_xu, xu_size)); 240 | // gpuErrchk(cudaMalloc(&d_xs_new, xs_size)); 241 | 242 | // gpuErrchk(cudaMemcpy(d_xu, h_xs, state_size*sizeof(T), cudaMemcpyHostToDevice)); 243 | // gpuErrchk(cudaMemcpy(d_xu + state_size, h_xu + state_size, control_size*sizeof(T), cudaMemcpyHostToDevice)); 244 | //TODO: needs sync? 245 | 246 | const size_t integrator_kernel_smem_size = sizeof(T)*(2*state_size + control_size + state_size/2 + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared()); 247 | //TODO: one block one thread? Why? 
248 | integrator_kernel<<<1,1, integrator_kernel_smem_size>>>(state_size, control_size, d_xs, d_xu, d_dynMem_const, dt); 249 | 250 | //TODO: needs sync? 251 | // gpuErrchk(cudaMemcpy(h_xs, d_xs_new, xs_size, cudaMemcpyDeviceToHost)); 252 | 253 | // gpuErrchk(cudaFree(d_xu)); 254 | // gpuErrchk(cudaFree(d_xs_new)); 255 | } 256 | 257 | template 258 | void just_shift(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xu){ 259 | for (uint32_t knot = 0; knot < knot_points-1; knot++){ 260 | uint32_t stepsize = (state_size+(knot 267 | __global__ 268 | void simple_integrator_kernel(uint32_t state_size, uint32_t control_size, T *d_x, T *d_u, void *d_dynMem_const, T dt){ 269 | 270 | 271 | extern __shared__ T s_mem[]; 272 | T *s_xkp1 = s_mem; 273 | T *s_xuk = s_xkp1 + state_size; 274 | T *s_temp = s_xuk + state_size + control_size; 275 | cgrps::thread_block block = cgrps::this_thread_block(); 276 | cgrps::grid_group grid = cgrps::this_grid(); 277 | for (unsigned ind = threadIdx.x; ind < state_size + control_size; ind += blockDim.x){ 278 | if(ind < state_size){ 279 | s_xuk[ind] = d_x[ind]; 280 | } 281 | else{ 282 | s_xuk[ind] = d_u[ind-state_size]; 283 | } 284 | } 285 | 286 | block.sync(); 287 | integrator(state_size, s_xkp1, s_xuk, s_temp, d_dynMem_const, dt, block); 288 | block.sync(); 289 | 290 | for (unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 291 | d_x[ind] = s_xkp1[ind]; 292 | } 293 | } 294 | 295 | template 296 | void simple_simulate(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xs, T *d_xu, void *d_dynMem_const, double timestep, double time_offset_us, double sim_time_us, unsigned long long = 123456){ 297 | 298 | // std::cout << "simulating for " << sim_time_us * 1e-6 << " seconds\n"; 299 | 300 | 301 | double time_offset = time_offset_us * 1e-6; 302 | double sim_time = sim_time_us * 1e-6; 303 | 304 | const T sim_step_time = 2e-4; 305 | const size_t simple_integrator_kernel_smem_size = 
sizeof(T)*(2*state_size + control_size + state_size/2 + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared()); 306 | const uint32_t states_s_controls = state_size + control_size; 307 | uint32_t control_offset = static_cast((time_offset) / timestep); 308 | T *control = &d_xu[control_offset * states_s_controls + state_size]; 309 | 310 | 311 | uint32_t sim_steps_needed = static_cast(sim_time / sim_step_time); 312 | 313 | 314 | for(uint32_t step = 0; step < sim_steps_needed; step++){ 315 | control_offset = static_cast((time_offset + step * sim_step_time) / timestep); 316 | control = &d_xu[control_offset * states_s_controls + state_size]; 317 | 318 | simple_integrator_kernel<<<1,32,simple_integrator_kernel_smem_size>>>(state_size, control_size, d_xs, control, d_dynMem_const, sim_step_time); 319 | 320 | } 321 | 322 | T half_sim_step_time = fmod(sim_time, sim_step_time); 323 | 324 | simple_integrator_kernel<<<1,32,simple_integrator_kernel_smem_size>>>(state_size, control_size, d_xs, control, d_dynMem_const, half_sim_step_time); 325 | } -------------------------------------------------------------------------------- /include/common/kkt.cuh: -------------------------------------------------------------------------------- 1 | 2 | #include "dynamics/rbd_plant.cuh" 3 | #include "merit.cuh" 4 | 5 | template 6 | size_t get_kkt_smem_size(uint32_t state_size, uint32_t control_size){ 7 | const uint32_t states_sq = state_size * state_size; 8 | const uint32_t controls_sq = control_size * control_size; 9 | 10 | size_t smem_size = sizeof(T)*(3*states_sq + 11 | controls_sq + 12 | 7 * state_size + 13 | 3 * control_size + 14 | state_size*control_size + 15 | max(grid::EE_POS_SHARED_MEM_COUNT, grid::DEE_POS_SHARED_MEM_COUNT) + 16 | max((state_size/2)*(state_size + control_size + 1) + gato_plant::forwardDynamicsAndGradient_TempMemSize_Shared(), 3 + (state_size/2)*6)); 17 | 18 | return smem_size; 19 | } 20 | 21 | 22 | template 23 | __global__ 24 | void generate_kkt_submatrices(uint32_t 
state_size, 25 | uint32_t control_size, 26 | uint32_t knot_points, 27 | T *d_G_dense, 28 | T *d_C_dense, 29 | T *d_g, 30 | T *d_c, 31 | void *d_dynMem_const, 32 | T timestep, 33 | T *d_eePos_traj, 34 | T *d_xs, 35 | T *d_xu) 36 | { 37 | 38 | const cgrps::thread_block block = cgrps::this_thread_block(); 39 | const uint32_t thread_id = threadIdx.x; 40 | const uint32_t num_threads = blockDim.x; 41 | const uint32_t block_id = blockIdx.x; 42 | const uint32_t num_blocks = gridDim.x; 43 | 44 | const uint32_t states_sq = state_size*state_size; 45 | const uint32_t states_p_controls = state_size * control_size; 46 | const uint32_t controls_sq = control_size * control_size; 47 | const uint32_t states_s_controls = state_size + control_size; 48 | 49 | 50 | extern __shared__ T s_temp[]; 51 | 52 | T *s_xux = s_temp; 53 | T *s_eePos_traj = s_xux + 2*state_size + control_size; 54 | T *s_Qk = s_eePos_traj + 6; 55 | T *s_Rk = s_Qk + states_sq; 56 | T *s_qk = s_Rk + controls_sq; 57 | T *s_rk = s_qk + state_size; 58 | T *s_end = s_rk + control_size; 59 | 60 | 61 | for(unsigned k = block_id; k < knot_points-1; k += num_blocks){ 62 | 63 | glass::copy(2*state_size + control_size, &d_xu[k*states_s_controls], s_xux); 64 | glass::copy(2 * 6, &d_eePos_traj[k*6], s_eePos_traj); 65 | 66 | __syncthreads(); 67 | 68 | if(k==knot_points-2){ // last block 69 | 70 | T *s_Ak = s_end; 71 | T *s_Bk = s_Ak + states_sq; 72 | T *s_Qkp1 = s_Bk + states_p_controls; 73 | T *s_qkp1 = s_Qkp1 + states_sq; 74 | T *s_integrator_error = s_qkp1 + state_size; 75 | T *s_extra_temp = s_integrator_error + state_size; 76 | 77 | integratorAndGradient( 78 | state_size, control_size, 79 | s_xux, 80 | s_Ak, 81 | s_Bk, 82 | s_integrator_error, 83 | s_extra_temp, 84 | d_dynMem_const, 85 | timestep, 86 | block 87 | ); 88 | __syncthreads(); 89 | 90 | gato_plant::trackingCostGradientAndHessian_lastblock( 91 | state_size, 92 | control_size, 93 | s_xux, 94 | s_eePos_traj, 95 | s_Qk, 96 | s_qk, 97 | s_Rk, 98 | s_rk, 99 | s_Qkp1, 100 
| s_qkp1, 101 | s_extra_temp, 102 | d_dynMem_const 103 | ); 104 | __syncthreads(); 105 | 106 | for(int i = thread_id; i < state_size; i+=num_threads){ 107 | d_c[i] = d_xu[i] - d_xs[i]; 108 | } 109 | glass::copy(states_sq, s_Qk, &d_G_dense[(states_sq+controls_sq)*k]); 110 | glass::copy(controls_sq, s_Rk, &d_G_dense[(states_sq+controls_sq)*k+states_sq]); 111 | glass::copy(states_sq, s_Qkp1, &d_G_dense[(states_sq+controls_sq)*(k+1)]); 112 | glass::copy(state_size, s_qk, &d_g[states_s_controls*k]); 113 | glass::copy(control_size, s_rk, &d_g[states_s_controls*k+state_size]); 114 | glass::copy(state_size, s_qkp1, &d_g[states_s_controls*(k+1)]); 115 | glass::copy(states_sq, static_cast(-1), s_Ak, &d_C_dense[(states_sq+states_p_controls)*k]); 116 | glass::copy(states_p_controls, static_cast(-1), s_Bk, &d_C_dense[(states_sq+states_p_controls)*k+states_sq]); 117 | glass::copy(state_size, s_integrator_error, &d_c[state_size*(k+1)]); 118 | 119 | } 120 | else{ // not last knot 121 | 122 | T *s_Ak = s_end; 123 | T *s_Bk = s_Ak + states_sq; 124 | T *s_integrator_error = s_Bk + states_p_controls; 125 | T *s_extra_temp = s_integrator_error + state_size; 126 | 127 | integratorAndGradient 131 | (state_size, control_size, 132 | s_xux, 133 | s_Ak, 134 | s_Bk, 135 | s_integrator_error, 136 | s_extra_temp, 137 | d_dynMem_const, 138 | timestep, 139 | block); 140 | __syncthreads(); 141 | 142 | gato_plant::trackingCostGradientAndHessian(state_size, 143 | control_size, 144 | s_xux, 145 | s_eePos_traj, 146 | s_Qk, 147 | s_qk, 148 | s_Rk, 149 | s_rk, 150 | s_extra_temp, 151 | d_dynMem_const); 152 | __syncthreads(); 153 | 154 | glass::copy(states_sq, s_Qk, &d_G_dense[(states_sq+controls_sq)*k]); 155 | glass::copy(controls_sq, s_Rk, &d_G_dense[(states_sq+controls_sq)*k+states_sq]); 156 | glass::copy(state_size, s_qk, &d_g[states_s_controls*k]); 157 | glass::copy(control_size, s_rk, &d_g[states_s_controls*k+state_size]); 158 | glass::copy(states_sq, static_cast(-1), s_Ak, 
&d_C_dense[(states_sq+states_p_controls)*k]); 159 | glass::copy(states_p_controls, static_cast(-1), s_Bk, &d_C_dense[(states_sq+states_p_controls)*k+states_sq]); 160 | glass::copy(state_size, s_integrator_error, &d_c[state_size*(k+1)]); 161 | } 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /include/common/merit.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "dynamics/rbd_plant.cuh" 6 | #include "integrator.cuh" 7 | 8 | //TODO: this 9 | template 10 | size_t get_merit_smem_size(uint32_t state_size, uint32_t control_size) 11 | { 12 | return sizeof(T) * (6 + (2 * state_size + control_size ) + 13 | ((int) 1.5 * state_size) + gato_plant::forwardDynamics_TempMemSize_Shared()); 14 | } 15 | 16 | // cost compute for line search 17 | template 18 | __global__ 19 | void ls_gato_compute_merit(uint32_t state_size, 20 | uint32_t control_size, 21 | uint32_t knot_points, 22 | T *d_xs, 23 | T *d_xu, 24 | T *d_eePos_traj, 25 | T mu, 26 | T dt, 27 | void *d_dynMem_const, 28 | T *d_dz, 29 | uint32_t alpha_multiplier, 30 | T *d_merits_out, 31 | T *d_merit_temp) 32 | { 33 | 34 | grid::robotModel *d_robotModel = (grid::robotModel *)d_dynMem_const; 35 | const cooperative_groups::thread_block block = cooperative_groups::this_thread_block(); 36 | const uint32_t thread_id = threadIdx.x; 37 | const uint32_t num_threads = blockDim.x; 38 | const uint32_t block_id = blockIdx.x; 39 | const uint32_t num_blocks = gridDim.x; 40 | 41 | const uint32_t states_s_controls = state_size + control_size; 42 | 43 | extern __shared__ T s_xux_k[]; 44 | 45 | T Jk, ck, pointmerit; 46 | 47 | T alpha = -1.0 / (1 << alpha_multiplier); // alpha sign 48 | T *s_eePos_k_traj = s_xux_k + 2*state_size+control_size; 49 | T *s_temp = s_eePos_k_traj + 6; 50 | 51 | 52 | for(unsigned knot = block_id; knot < knot_points; knot += num_blocks){ 53 | 54 | for(int i = thread_id; i 
< state_size+(knot < knot_points-1)*(states_s_controls); i+=num_threads){ 55 | s_xux_k[i] = d_xu[knot*states_s_controls+i] + alpha * d_dz[knot*states_s_controls+i]; 56 | if (i < 6){ 57 | s_eePos_k_traj[i] = d_eePos_traj[knot*6+i]; 58 | } 59 | } 60 | block.sync(); 61 | 62 | Jk = gato_plant::trackingcost(state_size, control_size, knot_points, s_xux_k, s_eePos_k_traj, s_temp, d_robotModel); 63 | 64 | block.sync(); 65 | if(knot < knot_points-1){ 66 | ck = integratorError(state_size, s_xux_k, &s_xux_k[states_s_controls], s_temp, d_robotModel, dt, block); 67 | } 68 | else{ 69 | // diff xs vs xs_traj 70 | for(int i = threadIdx.x; i < state_size; i++){ 71 | s_temp[i] = abs((d_xu[i] + alpha *d_dz[i]) - d_xs[i]); 72 | } 73 | block.sync(); 74 | glass::reduce(state_size, s_temp); 75 | block.sync(); 76 | ck = s_temp[0]; 77 | } 78 | block.sync(); 79 | 80 | if(thread_id == 0){ 81 | pointmerit = Jk + mu*ck; 82 | d_merit_temp[alpha_multiplier*knot_points+knot] = pointmerit; 83 | // printf("alpha: %f knot: %d reporting merit: %f\n", alpha, knot, pointmerit); 84 | } 85 | } 86 | cooperative_groups::this_grid().sync(); 87 | if(block_id == 0){ 88 | glass::reduce(knot_points, &d_merit_temp[alpha_multiplier*knot_points]); 89 | 90 | if(thread_id == 0){ 91 | d_merits_out[alpha_multiplier] = d_merit_temp[alpha_multiplier*knot_points]; 92 | } 93 | } 94 | } 95 | 96 | // zero merit out 97 | // shared mem size get_merit_smem_size() 98 | // cost compute for non line search 99 | template 100 | __global__ 101 | void compute_merit(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_xu, T *d_eePos_traj, T mu, T dt, void *d_dynMem_const, T *d_merit_out) 102 | { 103 | grid::robotModel *d_robotModel = (grid::robotModel *)d_dynMem_const; 104 | const cooperative_groups::thread_block block = cooperative_groups::this_thread_block(); 105 | const uint32_t thread_id = threadIdx.x; 106 | const uint32_t num_threads = blockDim.x; 107 | const uint32_t block_id = blockIdx.x; 108 | 109 | const 
uint32_t states_s_controls = state_size + control_size; 110 | extern __shared__ T s_xux_k[]; 111 | 112 | T Jk, ck, pointmerit; 113 | T *s_eePos_k_traj = s_xux_k + 2 * state_size + control_size; 114 | T *s_temp = s_eePos_k_traj + 6; 115 | 116 | for(unsigned knot = block_id; knot < knot_points; knot += gridDim.x){ 117 | 118 | for(int i = thread_id; i < state_size+(knot < knot_points-1)*(states_s_controls); i+=num_threads){ 119 | s_xux_k[i] = d_xu[knot*states_s_controls+i]; 120 | if (i < 6){ 121 | s_eePos_k_traj[i] = d_eePos_traj[knot*6+i]; 122 | } 123 | } 124 | 125 | block.sync(); 126 | Jk = gato_plant::trackingcost(state_size, control_size, knot_points, s_xux_k, s_eePos_k_traj, s_temp, d_robotModel); 127 | 128 | 129 | block.sync(); 130 | if(knot < knot_points-1){ 131 | ck = integratorError(state_size, s_xux_k, &s_xux_k[states_s_controls], s_temp, d_robotModel, dt, block); 132 | } 133 | else{ 134 | ck = 0; 135 | } 136 | block.sync(); 137 | 138 | if(thread_id == 0){ 139 | pointmerit = Jk + mu*ck; 140 | atomicAdd(d_merit_out, pointmerit); 141 | } 142 | } 143 | } 144 | -------------------------------------------------------------------------------- /include/common/settings.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | 5 | #ifndef KNOT_POINTS 6 | #define KNOT_POINTS 32 7 | #endif 8 | 9 | // default value is for iiwa arm 10 | #ifndef STATE_SIZE 11 | #define STATE_SIZE 14 12 | #endif 13 | 14 | 15 | /******************************************************************************* 16 | * Print Settings * 17 | *******************************************************************************/ 18 | 19 | 20 | #ifndef LIVE_PRINT_PATH 21 | #define LIVE_PRINT_PATH 0 22 | #endif 23 | 24 | #ifndef LIVE_PRINT_STATS 25 | #define LIVE_PRINT_STATS 0 26 | #endif 27 | 28 | /******************************************************************************* 29 | * Test Settings * 30 | 
*******************************************************************************/ 31 | 32 | 33 | #ifndef TEST_ITERS 34 | #define TEST_ITERS 1 35 | #endif 36 | 37 | #ifndef SAVE_DATA 38 | #define SAVE_DATA 0 39 | #endif 40 | 41 | #ifndef USE_DOUBLES 42 | #define USE_DOUBLES 0 43 | #endif 44 | 45 | #if USE_DOUBLES 46 | typedef double linsys_t; 47 | #else 48 | typedef float linsys_t; 49 | #endif 50 | 51 | /******************************************************************************* 52 | * MPC Settings * 53 | *******************************************************************************/ 54 | 55 | 56 | #ifndef CONST_UPDATE_FREQ 57 | #define CONST_UPDATE_FREQ 1 58 | #endif 59 | 60 | // runs sqp a bunch of times before starting to track 61 | #ifndef REMOVE_JITTERS 62 | #define REMOVE_JITTERS 1 63 | #endif 64 | 65 | // this constant controls when xu and goal will be shifted, should be a fraction of a timestep 66 | #ifndef SHIFT_THRESHOLD 67 | #define SHIFT_THRESHOLD (1 * timestep) 68 | #endif 69 | 70 | #ifndef SIMULATION_PERIOD 71 | #define SIMULATION_PERIOD 2000 72 | #endif 73 | 74 | #ifndef MERIT_THREADS 75 | #define MERIT_THREADS 128 76 | #endif 77 | 78 | // when enabled ABSOLUTE_QD_PENALTY penalizes qd like controls, rather than penalizing relative distance to precomputed traj 79 | #ifndef ABSOLUTE_QD_PENALTY 80 | #define ABSOLUTE_QD_PENALTY 0 81 | #endif 82 | 83 | 84 | #ifndef R_COST 85 | #if KNOT_POINTS == 64 86 | #define R_COST .001 87 | #else 88 | #define R_COST .0001 89 | #endif 90 | #endif 91 | 92 | #ifndef QD_COST 93 | #define QD_COST .0001 94 | #endif 95 | 96 | 97 | 98 | /******************************************************************************* 99 | * Linsys Settings * 100 | *******************************************************************************/ 101 | 102 | 103 | /* time_linsys = 1 to record linear system solve times. 104 | time_linsys = 0 to record number of sqp iterations. 105 | In both cases, the tracking error will also be recorded. 
*/ 106 | 107 | #ifndef TIME_LINSYS 108 | #define TIME_LINSYS 1 109 | #endif 110 | 111 | #ifndef PCG_NUM_THREADS 112 | #define PCG_NUM_THREADS 128 113 | #endif 114 | 115 | 116 | /* LINSYS_SOLVE = 1 uses pcg as the underlying linear system solver 117 | LINSYS_SOLVE = 0 uses qdldl as the underlying linear system solver */ 118 | 119 | #ifndef LINSYS_SOLVE 120 | #define LINSYS_SOLVE 1 121 | #endif 122 | 123 | // Values found using experiments 124 | #ifndef PCG_MAX_ITER 125 | #if LINSYS_SOLVE 126 | #if KNOT_POINTS == 32 127 | #define PCG_MAX_ITER 173 128 | #elif KNOT_POINTS == 64 129 | #define PCG_MAX_ITER 167 130 | #elif KNOT_POINTS == 128 131 | #define PCG_MAX_ITER 167 132 | #elif KNOT_POINTS == 256 133 | #define PCG_MAX_ITER 118 134 | #elif KNOT_POINTS == 512 135 | #define PCG_MAX_ITER 67 136 | #else 137 | #define PCG_MAX_ITER 200 138 | #endif 139 | #else 140 | #define PCG_MAX_ITER -1 141 | #define PCG_EXIT_TOL -1 142 | #endif 143 | 144 | #endif 145 | 146 | 147 | /******************************************************************************* 148 | * SQP Settings * 149 | *******************************************************************************/ 150 | 151 | 152 | #if TIME_LINSYS == 1 153 | #define SQP_MAX_ITER 20 154 | typedef double toplevel_return_type; 155 | #else 156 | #define SQP_MAX_ITER 40 157 | typedef uint32_t toplevel_return_type; 158 | #endif 159 | 160 | 161 | #ifndef SQP_MAX_TIME_US 162 | #define SQP_MAX_TIME_US 2000 163 | #endif 164 | 165 | #ifndef SCHUR_THREADS 166 | #define SCHUR_THREADS 128 167 | #endif 168 | 169 | #ifndef DZ_THREADS 170 | #define DZ_THREADS 128 171 | #endif 172 | 173 | #ifndef KKT_THREADS 174 | #define KKT_THREADS 128 175 | #endif 176 | 177 | 178 | 179 | /******************************************************************************* 180 | * Rho Settings * 181 | *******************************************************************************/ 182 | 183 | 184 | 185 | #ifndef RHO_MIN 186 | #define RHO_MIN 1e-3 187 | #endif 188 | 189 
| //TODO: get rid of rho in defines 190 | #ifndef RHO_FACTOR 191 | #define RHO_FACTOR 1.2 192 | #endif 193 | 194 | #ifndef RHO_MAX 195 | #define RHO_MAX 10 196 | #endif 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /include/dynamics/iiwa/iiwa_eepos_plant.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // // values assumed coming from an instance of grid 3 | // namespace grid{ 4 | // // 5 | // // TODO do I need all of these? 6 | // // 7 | 8 | // const int NUM_JOINTS = 30; 9 | // const int ID_DYNAMIC_SHARED_MEM_COUNT = 2340; 10 | // const int MINV_DYNAMIC_SHARED_MEM_COUNT = 9210; 11 | // const int FD_DYNAMIC_SHARED_MEM_COUNT = 10110; 12 | // const int ID_DU_DYNAMIC_SHARED_MEM_COUNT = 10980; 13 | // const int FD_DU_DYNAMIC_SHARED_MEM_COUNT = 10980; 14 | // const int ID_DU_MAX_SHARED_MEM_COUNT = 13410; 15 | // const int FD_DU_MAX_SHARED_MEM_COUNT = 16140; 16 | // const int SUGGESTED_THREADS = 512; 17 | 18 | // template 19 | // struct robotModel { 20 | // T *d_XImats; 21 | // int *d_topology_helpers; 22 | // }; 23 | // } 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include "iiwa_eepos_grid.cuh" 31 | #include "settings.cuh" 32 | 33 | #include "glass.cuh" 34 | 35 | // #include 36 | // #define RANDOM_MEAN 0 37 | // #define RANDOM_STDEV 0.001 38 | // std::default_random_engine randEng(time(0)); //seed 39 | // std::normal_distribution randDist(RANDOM_MEAN, RANDOM_STDEV); //mean followed by stdiv 40 | 41 | namespace gato_plant{ 42 | 43 | 44 | const unsigned SUGGESTED_THREADS = grid::SUGGESTED_THREADS; 45 | 46 | template 47 | __host__ __device__ 48 | constexpr T PI() {return static_cast(3.14159);} 49 | template 50 | __host__ __device__ 51 | constexpr T GRAVITY() {return static_cast(0.0);} 52 | 53 | 54 | // template 55 | // __host__ __device__ 56 | // constexpr T COST_Q1() {return static_cast(Q_COST);} 57 | 58 | template 59 | 
__host__ __device__ 60 | constexpr T COST_QD() {return static_cast(QD_COST);} 61 | 62 | template 63 | __host__ __device__ 64 | constexpr T COST_R() {return static_cast(R_COST);} 65 | 66 | template 67 | void *initializeDynamicsConstMem(){ 68 | grid::robotModel *d_robotModel = grid::init_robotModel(); 69 | return (void *)d_robotModel; 70 | } 71 | template 72 | void freeDynamicsConstMem(void *d_dynMem_const){ 73 | grid::free_robotModel((grid::robotModel*) d_dynMem_const); 74 | } 75 | 76 | // Start at q = [0,0,-0.25*PI,0,0.25*PI,0.5*PI,0] with small random for qd, u, lambda 77 | // template 78 | // __host__ 79 | // void loadInitialState(T *x){ 80 | // T q[7] = {PI(),0.25*PI(),0.167*PI(),-0.167*PI(),PI(),0.167*PI(),0.5*PI()}; 81 | // for (int i = 0; i < 7; i++){ 82 | // x[i] = q[i]; x[i + 7] = 0; 83 | // } 84 | // } 85 | 86 | // template 87 | // __host__ 88 | // void loadInitialControl(T *u){for (int i = 0; i < 7; i++){u[i] = 0;}} 89 | 90 | // // goal at q = [-0.5*PI,0.25*PI,0.167*PI,-0.167*PI,0.125*PI,0.167*PI,0.5*PI] with 0 for qd, u, lambda 91 | // template 92 | // __host__ 93 | // void loadGoalState(T *xg){ 94 | // T q[7] = {0,0,-0.25*PI(),0,0.25*PI(),0.5*PI(),0}; 95 | // for (int i = 0; i < 7; i++){ 96 | // xg[i] = q[i]; xg[i + 7] = static_cast(0); 97 | // } 98 | // } 99 | 100 | template 101 | __device__ 102 | void forwardDynamics(T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_XITemp, void *d_dynMem_const, cooperative_groups::thread_block block){ 103 | 104 | T *s_XImats = s_XITemp; T *s_temp = &s_XITemp[1008]; 105 | grid::load_update_XImats_helpers(s_XImats, s_q, (grid::robotModel *) d_dynMem_const, s_temp); 106 | __syncthreads(); 107 | 108 | grid::forward_dynamics_inner(s_qdd, s_q, s_qd, s_u, s_XImats, s_temp, gato_plant::GRAVITY()); 109 | 110 | // grid::forward_dynamics_device(s_qdd,s_q,s_qd,s_u,(grid::robotModel*)d_dynMem_const,GRAVITY()); 111 | } 112 | 113 | __host__ __device__ 114 | constexpr unsigned forwardDynamics_TempMemSize_Shared(){return 
grid::FD_DYNAMIC_SHARED_MEM_COUNT;} 115 | 116 | // template 117 | // __device__ 118 | // void forwardDynamicsGradient( T *s_dqdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){ 119 | // grid::forward_dynamics_gradient_device(s_dqdd, s_q, s_qd, s_u, s_temp, (grid::robotModel *)d_dynMem_const,GRAVITY()); 120 | // } 121 | 122 | // __host__ __device__ 123 | // constexpr unsigned forwardDynamicsGradient_TempMemSize_Shared(){return grid::FD_DU_MAX_SHARED_MEM_COUNT;} 124 | 125 | 126 | template 127 | __device__ 128 | void forwardDynamicsAndGradient(T *s_df_du, T *s_qdd, const T *s_q, const T *s_qd, const T *s_u, T *s_temp_in, void *d_dynMem_const){ 129 | 130 | T *s_XITemp = s_temp_in; 131 | grid::robotModel *d_robotModel = (grid::robotModel *) d_dynMem_const; 132 | 133 | T *s_XImats = s_XITemp; T *s_vaf = &s_XITemp[504]; T *s_dc_du = &s_vaf[126]; T *s_Minv = &s_dc_du[98]; T *s_temp = &s_Minv[49]; 134 | grid::load_update_XImats_helpers(s_XImats, s_q, d_robotModel, s_temp); __syncthreads(); 135 | //TODO: there is a slightly faster way as s_v does not change -- thus no recompute needed 136 | grid::direct_minv_inner(s_Minv, s_q, s_XImats, s_temp); __syncthreads(); 137 | T *s_c = s_temp; 138 | grid::inverse_dynamics_inner(s_c, s_vaf, s_q, s_qd, s_XImats, &s_temp[7], GRAVITY()); __syncthreads(); 139 | grid::forward_dynamics_finish(s_qdd, s_u, s_c, s_Minv); __syncthreads(); 140 | grid::inverse_dynamics_inner_vaf(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY()); __syncthreads(); 141 | grid::inverse_dynamics_gradient_inner(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY()); __syncthreads(); 142 | for(int ind = threadIdx.x + threadIdx.y*blockDim.x; ind < 98; ind += blockDim.x*blockDim.y){ 143 | int row = ind % 7; int dc_col_offset = ind - row; 144 | // account for the fact that Minv is an SYMMETRIC_UPPER triangular matrix 145 | T val = static_cast(0); 146 | for(int col = 0; col < 7; col++) { 147 | int index = (row <= col) 
* (col * 7 + row) + (row > col) * (row * 7 + col); 148 | val += s_Minv[index] * s_dc_du[dc_col_offset + col]; 149 | } 150 | s_df_du[ind] = -val; 151 | if (INCLUDE_DU && ind < 49){ 152 | int col = ind / 7; int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col); 153 | s_df_du[ind + 98] = s_Minv[index]; 154 | } 155 | } 156 | } 157 | 158 | 159 | // template 160 | // __device__ 161 | // void forwardDynamicsAndGradient(T *s_dqdd, T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_temp_in, void *d_dynMem_const, cooperative_groups::thread_block block){ 162 | 163 | // grid::robotModel *d_robotModel = (grid::robotModel *) d_dynMem_const; 164 | 165 | // T *s_dc_du = s_temp_in; 166 | // T *s_vaf = s_dc_du + 392; 167 | // T *s_Minv = s_vaf + 252; 168 | // T *s_XITemp = s_Minv + 196; 169 | // T *s_XImats = s_XITemp; T *s_temp = &s_XITemp[1008]; 170 | 171 | 172 | // grid::load_update_XImats_helpers(s_XImats, s_q, d_robotModel, s_temp); 173 | 174 | // grid::direct_minv_inner(s_Minv, s_q, s_XImats, s_temp); 175 | // grid::inverse_dynamics_inner(s_temp, s_vaf, s_q, s_qd, s_XImats, &s_temp[14], GRAVITY()); 176 | // grid::forward_dynamics_finish(s_qdd, s_u, s_temp, s_Minv); 177 | 178 | // grid::inverse_dynamics_inner_vaf(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY()); 179 | // grid::inverse_dynamics_gradient_inner(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY()); 180 | // for(int ind = threadIdx.x; ind < 392; ind += blockDim.x){ 181 | // int row = ind % 14; int dc_col_offset = ind - row; 182 | // // account for the fact that Minv is an SYMMETRIC_UPPER triangular matrix 183 | // T val = static_cast(0); 184 | // for(int col = 0; col < 14; col++) { 185 | // int index = (row <= col) * (col * 14 + row) + (row > col) * (row * 14 + col); 186 | // val += s_Minv[index] * s_dc_du[dc_col_offset + col]; 187 | // } 188 | // s_temp[ind] = -val; 189 | // } 190 | 191 | // for(int ind = threadIdx.x; ind < 392; ind += blockDim.x){ 192 | // s_dqdd[ind] = s_temp[ind]; 193 | // } 
194 | // __syncthreads(); 195 | 196 | 197 | // T *s_XITemp = s_temp_in; 198 | // grid::robotModel *d_robotModel = (grid::robotModel *) d_dynMem_const; 199 | // T *s_XImats = s_XITemp; T *s_vaf = &s_XITemp[504]; T *s_dc_du = &s_vaf[126]; T *s_Minv = &s_dc_du[98]; T *s_temp = &s_Minv[49]; 200 | // grid::load_update_XImats_helpers(s_XImats, s_q, d_robotModel, s_temp); __syncthreads(); 201 | // //TODO: there is a slightly faster way as s_v does not change -- thus no recompute needed 202 | // grid::direct_minv_inner(s_Minv, s_q, s_XImats, s_temp); __syncthreads(); 203 | // T *s_c = s_temp; 204 | // grid::inverse_dynamics_inner(s_c, s_vaf, s_q, s_qd, s_XImats, &s_temp[7], GRAVITY()); __syncthreads(); 205 | // grid::forward_dynamics_finish(s_qdd, s_u, s_c, s_Minv); __syncthreads(); 206 | // grid::inverse_dynamics_inner_vaf(s_vaf, s_q, s_qd, s_qdd, s_XImats, s_temp, GRAVITY()); __syncthreads(); 207 | // grid::inverse_dynamics_gradient_inner(s_dc_du, s_q, s_qd, s_vaf, s_XImats, s_temp, GRAVITY()); __syncthreads(); 208 | // for(int ind = threadIdx.x + threadIdx.y*blockDim.x; ind < 98; ind += blockDim.x*blockDim.y){ 209 | // int row = ind % 7; int dc_col_offset = ind - row; 210 | // // account for the fact that Minv is an SYMMETRIC_UPPER triangular matrix 211 | // T val = static_cast(0); 212 | // for(int col = 0; col < 7; col++) { 213 | // int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col); 214 | // val += s_Minv[index] * s_dc_du[dc_col_offset + col]; 215 | // } 216 | // s_dqdd[ind] = -val; 217 | // if (1 && ind < 49){ 218 | // int col = ind / 7; int index = (row <= col) * (col * 7 + row) + (row > col) * (row * 7 + col); 219 | // s_dqdd[ind + 98] = s_Minv[index]; 220 | // } 221 | // } 222 | 223 | 224 | 225 | // grid::robotModel *d_robotModel = (grid::robotModel *) d_dynMem_const; 226 | // grid::forward_dynamics_gradient_device(s_dqdd, s_q, s_qd, s_u, d_robotModel, GRAVITY()); 227 | // } 228 | 229 | 230 | __host__ __device__ 231 | constexpr unsigned 
forwardDynamicsAndGradient_TempMemSize_Shared(){return grid::FD_DU_MAX_SHARED_MEM_COUNT;} 232 | 233 | 234 | __host__ 235 | unsigned trackingcost_TempMemCt_Shared(uint32_t state_size, uint32_t control_size, uint32_t knot_points){ 236 | return state_size/2 + control_size + 3 + 6 + grid::EE_POS_SHARED_MEM_COUNT; 237 | } 238 | 239 | ///TODO: get rid of divergence 240 | template 241 | __device__ 242 | T trackingcost(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *s_xu, T *s_eePos_traj, T *s_temp, const grid::robotModel *d_robotModel){ 243 | 244 | // const T Q_cost = COST_Q1(); 245 | const T QD_cost = COST_QD(); 246 | const T R_cost = COST_R(); 247 | 248 | T err; 249 | T val = 0; 250 | 251 | // QD and R penalty 252 | const uint32_t threadsNeeded = state_size/2 + control_size * (blockIdx.x < knot_points - 1); 253 | 254 | T *s_cost_vec = s_temp; 255 | T *s_eePos_cost = s_cost_vec + threadsNeeded + 3; 256 | T *s_extra_temp = s_eePos_cost + 6; 257 | 258 | 259 | 260 | 261 | for(int i = threadIdx.x; i < threadsNeeded; i += blockDim.x){ 262 | if(i < state_size/2){ 263 | err = s_xu[i + state_size/2]; 264 | val = QD_cost * err * err; 265 | } 266 | else{ 267 | err = s_xu[i+state_size/2]; 268 | val = R_cost * err * err; 269 | } 270 | s_cost_vec[i] = static_cast(0.5) * val; 271 | } 272 | 273 | __syncthreads(); 274 | grid::end_effector_positions_device(s_eePos_cost, s_xu, s_extra_temp, d_robotModel); 275 | __syncthreads(); 276 | 277 | // if(threadIdx.x==0){ 278 | // printf("block %d with input %f,%f,%f,%f,%f,%f,%f\n", blockIdx.x, s_xu[7],s_xu[8],s_xu[9],s_xu[10],s_xu[11],s_xu[12],s_xu[13]); 279 | // } 280 | 281 | for(int i = threadIdx.x; i < 3; i+=blockDim.x){ 282 | err = s_eePos_cost[i] - s_eePos_traj[i]; 283 | s_cost_vec[threadsNeeded + i] = static_cast(0.5) * err * err; 284 | } 285 | __syncthreads(); 286 | glass::reduce(3 + threadsNeeded, s_cost_vec); 287 | __syncthreads(); 288 | 289 | return s_cost_vec[0]; 290 | } 291 | 292 | 293 | ///TODO: 
costgradientandhessian could be much faster with no divergence 294 | // not last block 295 | template 296 | __device__ 297 | void trackingCostGradientAndHessian(uint32_t state_size, 298 | uint32_t control_size, 299 | T *s_xu, 300 | T *s_eePos_traj, 301 | T *s_Qk, 302 | T *s_qk, 303 | T *s_Rk, 304 | T *s_rk, 305 | T *s_temp, 306 | void *d_robotModel) 307 | { 308 | // const T Q_cost = COST_Q1(); 309 | const T QD_cost = COST_QD(); 310 | const T R_cost = COST_R(); 311 | 312 | T *s_eePos = s_temp; 313 | T *s_eePos_grad = s_eePos + 6; 314 | T *s_scratch = s_eePos_grad + 6 * state_size/2; 315 | 316 | const uint32_t threads_needed = state_size + control_size*computeR; 317 | uint32_t offset; 318 | T x_err, y_err, z_err, err; 319 | 320 | grid::end_effector_positions_device(s_eePos, s_xu, s_scratch, (grid::robotModel *)d_robotModel); 321 | __syncthreads(); 322 | grid::end_effector_positions_gradient_device(s_eePos_grad, s_xu, s_scratch, (grid::robotModel *)d_robotModel); 323 | __syncthreads(); 324 | 325 | // if(threadIdx.x==0){ 326 | // printf("block %d with input %f,%f,%f,%f,%f,%f,%f\n", blockIdx.x, s_xu[0],s_xu[1],s_xu[2],s_xu[3],s_xu[4],s_xu[5],s_xu[6]); 327 | // } 328 | 329 | for (int i = threadIdx.x; i < threads_needed; i += blockDim.x){ 330 | 331 | if(i < state_size){ 332 | //gradient 333 | if (i < state_size / 2){ 334 | // sum x, y, z error 335 | x_err = (s_eePos[0] - s_eePos_traj[0]); 336 | y_err = (s_eePos[1] - s_eePos_traj[1]); 337 | z_err = (s_eePos[2] - s_eePos_traj[2]); 338 | 339 | s_qk[i] = s_eePos_grad[6 * i + 0] * x_err + s_eePos_grad[6 * i + 1] * y_err + s_eePos_grad[6 * i + 2] * z_err; 340 | } 341 | else{ 342 | err = s_xu[i]; 343 | s_qk[i] = QD_cost * err; 344 | } 345 | 346 | } 347 | else{ 348 | err = s_xu[i]; 349 | offset = i - state_size; 350 | 351 | //gradient 352 | s_rk[offset] = R_cost * err; 353 | } 354 | } 355 | 356 | __syncthreads(); 357 | 358 | for (int i = threadIdx.x; i < threads_needed; i += blockDim.x){ 359 | if (i < state_size){ 360 | //hessian 
361 | for(int j = 0; j < state_size; j++){ 362 | if(j < state_size / 2 && i < state_size / 2){ 363 | s_Qk[i*state_size + j] = s_qk[i] * s_qk[j]; 364 | } 365 | else{ 366 | s_Qk[i*state_size + j] = (i == j) ? QD_cost : static_cast(0); 367 | } 368 | } 369 | } 370 | else{ 371 | offset = i - state_size; 372 | //hessian 373 | for(int j = 0; j < control_size; j++){ 374 | s_Rk[offset*control_size+j] = (offset == j) ? R_cost : static_cast(0); 375 | } 376 | } 377 | } 378 | } 379 | 380 | // last block 381 | template 382 | __device__ 383 | void trackingCostGradientAndHessian_lastblock(uint32_t state_size, 384 | uint32_t control_size, 385 | T *s_xux, 386 | T *s_eePos_traj, 387 | T *s_Qk, 388 | T *s_qk, 389 | T *s_Rk, 390 | T *s_rk, 391 | T *s_Qkp1, 392 | T *s_qkp1, 393 | T *s_temp, 394 | void *d_dynMem_const 395 | ) 396 | { 397 | trackingCostGradientAndHessian(state_size, control_size, s_xux, s_eePos_traj, s_Qk, s_qk, s_Rk, s_rk, s_temp, d_dynMem_const); 398 | __syncthreads(); 399 | trackingCostGradientAndHessian(state_size, control_size, s_xux, &s_eePos_traj[6], s_Qkp1, s_qkp1, nullptr, nullptr, s_temp, d_dynMem_const); 400 | __syncthreads(); 401 | } 402 | 403 | // __host__ __device__ 404 | // constexpr unsigned costGradientAndHessian_TempMemSize_Shared(){return 0;} 405 | } 406 | 407 | -------------------------------------------------------------------------------- /include/dynamics/iiwa/iiwa_plant.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | // // values assumed coming from an instance of grid 3 | // namespace grid{ 4 | // // 5 | // // TODO do I need all of these? 
// //

// const int NUM_JOINTS = 30;
// const int ID_DYNAMIC_SHARED_MEM_COUNT = 2340;
// const int MINV_DYNAMIC_SHARED_MEM_COUNT = 9210;
// const int FD_DYNAMIC_SHARED_MEM_COUNT = 10110;
// const int ID_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
// const int FD_DU_DYNAMIC_SHARED_MEM_COUNT = 10980;
// const int ID_DU_MAX_SHARED_MEM_COUNT = 13410;
// const int FD_DU_MAX_SHARED_MEM_COUNT = 16140;
// const int SUGGESTED_THREADS = 512;

// template
// struct robotModel {
// T *d_XImats;
// int *d_topology_helpers;
// };
// }

// NOTE(review): the names of the five system headers below were lost when this listing
// was extracted. Restore them from the repository; the code in this file uses uint32_t
// and cooperative_groups.
#include
#include
#include
#include
#include
#include "iiwa_grid.cuh"
#include "../settings.cuh"

#include "glass.cuh"

// #include
// #define RANDOM_MEAN 0
// #define RANDOM_STDEV 0.001
// std::default_random_engine randEng(time(0)); //seed
// std::normal_distribution randDist(RANDOM_MEAN, RANDOM_STDEV); //mean followed by stdiv

namespace gato_plant{


    // Thread count per block suggested by the grid dynamics code.
    const unsigned SUGGESTED_THREADS = grid::SUGGESTED_THREADS;

    // Pi, truncated to 3.14159, converted to T.
    template <typename T>
    __host__ __device__
    constexpr T PI() {return static_cast<T>(3.14159);}

    // Gravity constant handed to the grid dynamics calls; it is 0.0 here.
    template <typename T>
    __host__ __device__
    constexpr T GRAVITY() {return static_cast<T>(0.0);}


    // Cost weights: each returns the corresponding settings macro converted to T.
    // Q_COST weights position error, QD_COST velocity, R_COST control.
    template <typename T>
    __host__ __device__
    constexpr T COST_Q1() {return static_cast<T>(Q_COST);}

    template <typename T>
    __host__ __device__
    constexpr T COST_QD() {return static_cast<T>(QD_COST);}

    template <typename T>
    __host__ __device__
    constexpr T COST_R() {return static_cast<T>(R_COST);}

    // Creates the robot model through grid::init_robotModel<T>() and returns it as an
    // opaque pointer. Release it with freeDynamicsConstMem<T>().
    template <typename T>
    void *initializeDynamicsConstMem(){
        grid::robotModel<T> *d_robotModel = grid::init_robotModel<T>();
        return (void *)d_robotModel;
    }

    // Releases a model obtained from initializeDynamicsConstMem<T>().
    template <typename T>
    void freeDynamicsConstMem(void *d_dynMem_const){
        grid::free_robotModel((grid::robotModel<T>*) d_dynMem_const);
    }

    // Start at q = [0,0,-0.25*PI,0,0.25*PI,0.5*PI,0] with small random
for qd, u, lambda
    // NOTE: the "Start at q = ..." comment above is stale; the values actually loaded are
    // q = [PI, 0.25*PI, 0.167*PI, -0.167*PI, PI, 0.167*PI, 0.5*PI] and qd = 0.
    //
    // Writes the 14-entry initial state into x: x[0..6] = q, x[7..13] = 0.
    template <typename T>
    __host__
    void loadInitialState(T *x){
        T q[7] = {PI<T>(),0.25*PI<T>(),0.167*PI<T>(),-0.167*PI<T>(),PI<T>(),0.167*PI<T>(),0.5*PI<T>()};
        for (int i = 0; i < 7; i++){
            x[i] = q[i]; x[i + 7] = 0;
        }
    }

    // Writes a zero control into u[0..6].
    template <typename T>
    __host__
    void loadInitialControl(T *u){for (int i = 0; i < 7; i++){u[i] = 0;}}

    // Writes the 14-entry goal state into xg:
    // q = [0, 0, -0.25*PI, 0, 0.25*PI, 0.5*PI, 0] in xg[0..6] and zero velocity in xg[7..13].
    template <typename T>
    __host__
    void loadGoalState(T *xg){
        T q[7] = {0,0,-0.25*PI<T>(),0,0.25*PI<T>(),0.5*PI<T>(),0};
        for (int i = 0; i < 7; i++){
            xg[i] = q[i]; xg[i + 7] = static_cast<T>(0);
        }
    }

    // Forwards to grid::forward_dynamics_device with the plant's gravity constant.
    // `block` is accepted for interface compatibility and not used.
    template <typename T>
    __device__
    void forwardDynamics(T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
        grid::forward_dynamics_device<T>(s_qdd,s_q,s_qd,s_u,s_temp,(grid::robotModel<T>*)d_dynMem_const,GRAVITY<T>());
    }

    // Value the plant reports as the s_temp requirement of forwardDynamics.
    __host__ __device__
    constexpr unsigned forwardDynamics_TempMemSize_Shared(){return grid::FD_DYNAMIC_SHARED_MEM_COUNT;}

    // Forwards to grid::forward_dynamics_gradient_device with the plant's gravity constant.
    // `block` is accepted for interface compatibility and not used.
    template <typename T>
    __device__
    void forwardDynamicsGradient( T *s_dqdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
        grid::forward_dynamics_gradient_device<T>(s_dqdd, s_q, s_qd, s_u, s_temp, (grid::robotModel<T> *)d_dynMem_const,GRAVITY<T>());
    }

    // Value the plant reports as the s_temp requirement of forwardDynamicsGradient.
    __host__ __device__
    constexpr unsigned forwardDynamicsGradient_TempMemSize_Shared(){return grid::FD_DU_MAX_SHARED_MEM_COUNT_new_version;}

    // Forwards to grid::forward_dynamics_and_gradient_device with the plant's gravity constant.
    // `block` is accepted for interface compatibility and not used.
    template <typename T>
    __device__
    void forwardDynamicsAndGradient(T *s_dqdd, T *s_qdd, T *s_q, T *s_qd, T *s_u, T *s_temp, void *d_dynMem_const, cooperative_groups::thread_block block){
        grid::forward_dynamics_and_gradient_device<T>(s_dqdd, s_qdd, s_q, s_qd, s_u, s_temp, (grid::robotModel<T> *)d_dynMem_const,GRAVITY<T>());
    }


    // Value the plant reports as the s_temp requirement of forwardDynamicsAndGradient.
    __host__ __device__
    constexpr unsigned forwardDynamicsAndGradient_TempMemSize_Shared(){return grid::FD_DU_MAX_SHARED_MEM_COUNT_new_version;}


    ///TODO: get rid of divergence
    // Quadratic tracking cost of one knot point.
    //
    // Each processed entry i contributes 0.5 * w * err^2 to s_temp[i]:
    //   positions  (i < state_size/2)  : w = Q_COST,  err = s_xux[i] - s_xux_traj[i]
    //   velocities (i < state_size)    : w = QD_COST, err = s_xux[i], or the difference
    //                                    to s_xux_traj[i] when ABSOLUTE_QD_PENALTY is 0
    //   controls   (i >= state_size)   : w = R_COST,  err = s_xux[i]
    // Controls are skipped when blockIdx.x == knot_points - 1.
    // s_temp is then passed to glass::reduce and s_temp[0] is returned
    // (presumably the sum of the contributions - confirm against glass).
    //
    // s_temp needs at least state_size + control_size entries.
    // NOTE(review): work is distributed with threadIdx.x / blockDim.x but synchronized
    // through `g`, so `g` is expected to be the whole thread block.
    template <typename T>
    __device__
    T trackingcost(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *s_xux, T *s_xux_traj, T *s_temp, cooperative_groups::thread_group g = cooperative_groups::this_thread_block()){

        // the final knot point carries no control
        const uint32_t threadsNeeded = state_size + control_size * (blockIdx.x != knot_points - 1);
        const T Q_cost = COST_Q1<T>();
        const T QD_cost = COST_QD<T>();
        const T R_cost = COST_R<T>();

        T err, val;

        for(uint32_t i = threadIdx.x; i < threadsNeeded; i += blockDim.x){
            if(i < state_size / 2){
                err = s_xux[i] - s_xux_traj[i];
                val = Q_cost * err * err;
            }
            else if(i < state_size){
#if ABSOLUTE_QD_PENALTY
                err = s_xux[i];
#else
                err = s_xux[i] - s_xux_traj[i];
#endif
                val = QD_cost * err * err;
            }
            else{
                err = s_xux[i];
                val = R_cost * err * err;
            }
            s_temp[i] = static_cast<T>(0.5) * val;
        }

        g.sync();
        glass::reduce<T>(threadsNeeded, s_temp);
        g.sync();
        return s_temp[0];
    }


    ///TODO: costgradientandhessian could be much faster with no divergence
    // not last block
    //
    // Gradient and Hessian of the quadratic tracking cost for one knot point.
    //
    //   s_xu, s_xu_traj - [state | control] iterate and reference for this knot point
    //   s_Qk, s_qk      - out: state_size x state_size diagonal Hessian and state gradient
    //   s_Rk, s_rk      - out: control_size x control_size diagonal Hessian and control gradient
    //   block_id        - not used
    //   g               - group whose threads share the work (rank/size striding)
    //
    // The diagonal holds Q_COST for positions, QD_COST for velocities and R_COST for
    // controls. No synchronization is performed; the caller syncs before reading the outputs.
    template <typename T>
    __device__
    void trackingCostGradientAndHessian(uint32_t state_size,
                                        uint32_t control_size,
                                        T *s_xu,
                                        T *s_xu_traj,
                                        T *s_Qk,
                                        T *s_qk,
                                        T *s_Rk,
                                        T *s_rk,
                                        uint32_t block_id,
                                        cooperative_groups::thread_group g)
    {
        const uint32_t threadsNeeded = state_size + control_size;
        const T Q_cost = COST_Q1<T>();
        const T QD_cost = COST_QD<T>();
        const T R_cost = COST_R<T>();

        uint32_t offset;
        T err;

        for (uint32_t i = g.thread_rank(); i < threadsNeeded; i += g.size()){

            if(i < state_size){
                //gradient
                if (i < state_size / 2){
                    err = s_xu[i] - s_xu_traj[i];
                    s_qk[i] = Q_cost * err;
                }
                else{
#if ABSOLUTE_QD_PENALTY
                    err = s_xu[i];
#else
                    err = s_xu[i] - s_xu_traj[i];
#endif
                    s_qk[i] = QD_cost * err;
                }

                //hessian: row i, non-zero only on the diagonal
                for(uint32_t j = 0; j < state_size; j++){
                    if(j < state_size / 2){
                        s_Qk[i*state_size + j] = (i == j) ? Q_cost : static_cast<T>(0);
                    }
                    else{
                        s_Qk[i*state_size + j] = (i == j) ? QD_cost : static_cast<T>(0);
                    }
                }
            }
            else{

                err = s_xu[i];
                offset = i - state_size;

                //gradient
                s_rk[offset] = R_cost * err;

                //hessian
                for(uint32_t j = 0; j < control_size; j++){
                    s_Rk[offset*control_size+j] = (offset == j) ? R_cost : static_cast<T>(0);
                }
            }
        }
    }

    // last block
    //
    // Same as trackingCostGradientAndHessian, but additionally fills the terms of the
    // terminal state stored after the control.
    //
    //   s_xux, s_xux_traj - [x_k | u_k | x_{k+1}] iterate and reference
    //   s_Qk, s_qk, s_Rk, s_rk - out: terms for x_k and u_k
    //   s_Qkp1, s_qkp1         - out: terms for x_{k+1}
    //   block_id               - not used
    //   g                      - group whose threads share the work
    //
    // No synchronization is performed; the caller syncs before reading the outputs.
    template <typename T>
    __device__
    void trackingCostGradientAndHessian_lastblock(uint32_t state_size,
                                                  uint32_t control_size,
                                                  T *s_xux,
                                                  T *s_xux_traj,
                                                  T *s_Qk,
                                                  T *s_qk,
                                                  T *s_Rk,
                                                  T *s_rk,
                                                  T *s_Qkp1,
                                                  T *s_qkp1,
                                                  uint32_t block_id,
                                                  cooperative_groups::thread_group g)
    {
        const uint32_t threadsNeeded = 2*state_size + control_size;
        const T Q_cost = COST_Q1<T>();
        const T QD_cost = COST_QD<T>();
        const T R_cost = COST_R<T>();

        T err;
        uint32_t offset;

        for (uint32_t i = g.thread_rank(); i < threadsNeeded; i += g.size()){

            if (i < state_size){
                // x_k
                if(i < state_size / 2){
                    err = s_xux[i] - s_xux_traj[i];
                    s_qk[i] = Q_cost * err;
                }
                else{
#if ABSOLUTE_QD_PENALTY
                    err = s_xux[i];
#else
                    err = s_xux[i] - s_xux_traj[i];
#endif
                    s_qk[i] = QD_cost * err;
                }

                for(uint32_t j = 0; j < state_size; j++){
                    if(j < state_size / 2){
                        s_Qk[i*state_size + j] = (i == j) ? Q_cost : static_cast<T>(0);
                    }
                    else{
                        s_Qk[i*state_size + j] = (i == j) ? QD_cost : static_cast<T>(0);
                    }
                }
            }
            else if(i < state_size + control_size){
                // u_k
                err = s_xux[i];
                offset = i - state_size;
                s_rk[offset] = R_cost * err;

                for(uint32_t j = 0; j < control_size; j++){
                    s_Rk[offset*control_size + j] = (offset == j) ? R_cost : static_cast<T>(0);
                }
            }
            else{
                // x_{k+1}: i indexes s_xux / s_xux_traj, offset indexes the outputs
                offset = i - state_size - control_size;
                if(offset < state_size / 2){
                    err = s_xux[i] - s_xux_traj[i];
                    s_qkp1[offset] = Q_cost * err;
                }
                else{
#if ABSOLUTE_QD_PENALTY
                    err = s_xux[i];
#else
                    err = s_xux[i] - s_xux_traj[i];
#endif
                    s_qkp1[offset] = QD_cost * err;
                }


                for(uint32_t j = 0; j < state_size; j++){
                    if(j < state_size / 2){
                        s_Qkp1[offset*state_size+j] = (offset == j) ? Q_cost : static_cast<T>(0);
                    }
                    else{
                        s_Qkp1[offset*state_size+j] = (offset == j) ? QD_cost : static_cast<T>(0);
                    }
                }

            }
        }
    }

    // The cost gradient/Hessian routines of this plant need no shared scratch memory.
    __host__ __device__
    constexpr unsigned costGradientAndHessian_TempMemSize_Shared(){return 0;}
}


--------------------------------------------------------------------------------
/include/dynamics/rbd_plant.cuh:
--------------------------------------------------------------------------------
#pragma once

// #include "iiwa_plant.cuh"

#include "iiwa/iiwa_eepos_plant.cuh"
--------------------------------------------------------------------------------
/include/mpcsim.cuh:
--------------------------------------------------------------------------------
#pragma once
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "integrator.cuh"
#include "settings.cuh"
#include "utils/experiment.cuh"
#include
"gpuassert.cuh" 20 | 21 | #if LINSYS_SOLVE == 1 22 | #include "pcg/sqp.cuh" 23 | #else 24 | #include "qdldl/sqp.cuh" 25 | #endif 26 | 27 | 28 | 29 | template 30 | __global__ 31 | void compute_tracking_error_kernel(T *d_tracking_error, uint32_t state_size, T *d_xu_goal, T *d_xs){ 32 | 33 | T err; 34 | 35 | for(int ind = threadIdx.x; ind < state_size/2; ind += blockDim.x){ 36 | err = abs(d_xs[ind] - d_xu_goal[ind]); 37 | atomicAdd(d_tracking_error, err); 38 | } 39 | } 40 | 41 | 42 | template 43 | T compute_tracking_error(uint32_t state_size, T *d_xu_goal, T *d_xs){ 44 | 45 | T h_tracking_error = 0.0f; 46 | T *d_tracking_error; 47 | gpuErrchk(cudaMalloc(&d_tracking_error, sizeof(T))); 48 | gpuErrchk(cudaMemcpy(d_tracking_error, &h_tracking_error, sizeof(T), cudaMemcpyHostToDevice)); 49 | 50 | compute_tracking_error_kernel<<<1,32>>>(d_tracking_error, state_size, d_xu_goal, d_xs); 51 | 52 | gpuErrchk(cudaMemcpy(&h_tracking_error, d_tracking_error, sizeof(T), cudaMemcpyDeviceToHost)); 53 | gpuErrchk(cudaFree(d_tracking_error)); 54 | return h_tracking_error; 55 | } 56 | 57 | 58 | template 59 | void dump_tracking_data(std::vector *pcg_iters, std::vector *pcg_exits, std::vector *linsys_times, std::vector *sqp_times, std::vector *sqp_iters, 60 | std::vector *sqp_exits, std::vector *tracking_errors, std::vector> *tracking_path, uint32_t timesteps_taken, 61 | uint32_t control_updates_taken, uint32_t start_state_ind, uint32_t goal_state_ind, uint32_t test_iter, 62 | std::string filename_prefix){ 63 | // Helper function to create file names 64 | auto createFileName = [&](const std::string& data_type) { 65 | std::string filename = filename_prefix + "_" + std::to_string(test_iter) + "_" + data_type + ".result"; 66 | return filename; 67 | }; 68 | 69 | // Helper function to dump single-dimension vector data 70 | auto dumpVectorData = [&](const auto& data, const std::string& data_type) { 71 | std::ofstream file(createFileName(data_type)); 72 | if (!file.is_open()) { 73 | std::cerr << 
"Failed to open " << data_type << " file.\n"; 74 | return; 75 | } 76 | for (const auto& item : *data) { 77 | file << item << '\n'; 78 | } 79 | file.close(); 80 | }; 81 | 82 | // Dump single-dimension vector data 83 | dumpVectorData(pcg_iters, "pcg_iters"); 84 | dumpVectorData(linsys_times, "linsys_times"); 85 | dumpVectorData(sqp_times, "sqp_times"); 86 | dumpVectorData(sqp_iters, "sqp_iters"); 87 | dumpVectorData(sqp_exits, "sqp_exits"); 88 | dumpVectorData(tracking_errors, "tracking_errors"); 89 | dumpVectorData(pcg_exits, "pcg_exits"); 90 | 91 | 92 | // Dump two-dimension vector data (tracking_path) 93 | std::ofstream file(createFileName("tracking_path")); 94 | if (!file.is_open()) { 95 | std::cerr << "Failed to open tracking_path file.\n"; 96 | return; 97 | } 98 | for (const auto& outerItem : *tracking_path) { 99 | for (const auto& innerItem : outerItem) { 100 | file << innerItem << ','; 101 | } 102 | file << '\n'; 103 | } 104 | file.close(); 105 | 106 | std::ofstream statsfile(createFileName("stats")); 107 | if (!statsfile.is_open()) { 108 | std::cerr << "Failed to open stats file.\n"; 109 | return; 110 | } 111 | statsfile << "timesteps: " << timesteps_taken << "\n"; 112 | statsfile << "control_updates: " << control_updates_taken << "\n"; 113 | // printStatsToFile(&linsys_times, ) 114 | 115 | statsfile.close(); 116 | } 117 | 118 | 119 | void print_test_config(){ 120 | std::cout << "Knot points: " << KNOT_POINTS << "\n"; 121 | std::cout << "State size: " << STATE_SIZE << "\n"; 122 | std::cout << "Datatype: " << (USE_DOUBLES ? "DOUBLE" : "FLOAT") << "\n"; 123 | std::cout << "Sqp exits condition: " << (CONST_UPDATE_FREQ ? 
"CONSTANT TIME" : "CONSTANT ITERS") << "\n"; 124 | std::cout << "QD COST: " << QD_COST << "\n"; 125 | std::cout << "R COST: " << R_COST << "\n"; 126 | std::cout << "Rho factor: " << RHO_FACTOR << "\n"; 127 | std::cout << "Rho max: " << RHO_MAX << "\n"; 128 | std::cout << "Test iters: " << TEST_ITERS << "\n"; 129 | #if CONST_UPDATE_FREQ 130 | std::cout << "Max sqp time: " << SQP_MAX_TIME_US << "\n"; 131 | #else 132 | std::cout << "Max sqp iter: " << SQP_MAX_ITER << "\n"; 133 | #endif 134 | std::cout << "Solver: " << ( (LINSYS_SOLVE == 1) ? "PCG" : "QDLDL") << "\n"; 135 | #if LINSYS_SOLVE == 1 136 | std::cout << "Max pcg iter: " << PCG_MAX_ITER << "\n"; 137 | // std::cout << "pcg exit tol: " << PCG_EXIT_TOL << "\n"; 138 | #endif 139 | std::cout << "Save data: " << (SAVE_DATA ? "ON" : "OFF") << "\n"; 140 | std::cout << "Jitters: " << (REMOVE_JITTERS ? "ON" : "OFF") << "\n"; 141 | 142 | std::cout << "\n\n"; 143 | } 144 | 145 | 146 | template 147 | std::tuple, std::vector, linsys_t> simulateMPC(const uint32_t state_size, const uint32_t control_size, const uint32_t knot_points, const uint32_t traj_steps, 148 | float timestep, T *d_eePos_traj, T *d_xu_traj, T *d_xs, uint32_t start_state_ind, uint32_t goal_state_ind, uint32_t test_iter, T linsys_exit_tol, 149 | std::string test_output_prefix){ 150 | 151 | const uint32_t traj_len = (state_size+control_size)*knot_points-control_size; 152 | 153 | const T shift_threshold = SHIFT_THRESHOLD; 154 | const int max_control_updates = 100000; 155 | 156 | 157 | // struct timespec solve_start, solve_end; 158 | double sqp_solve_time_us = 0; // current sqp solve time 159 | double simulation_time = 0; // current simulation time 160 | double prev_simulation_time = 0; // last simulation time 161 | double time_since_timestep = 0; // time since last timestep of original trajectory 162 | bool shifted = false; // has xu been shifted 163 | uint32_t traj_offset = 0; // current goal states of original trajectory 164 | 165 | 166 | // vars for 
recording data 167 | std::vector> tracking_path; // list of traversed traj 168 | std::vector linsys_iters; 169 | std::vector linsys_times; 170 | std::vector sqp_times; 171 | std::vector sqp_iters; 172 | std::vector sqp_exits; 173 | std::vector linsys_exits; 174 | std::vector tracking_errors; 175 | std::vector cur_linsys_iters; 176 | std::vector cur_linsys_exits; 177 | std::vector cur_linsys_times; 178 | std::tuple, std::vector, double, uint32_t, bool, std::vector> sqp_stats; 179 | uint32_t cur_sqp_iters; 180 | T cur_tracking_error; 181 | int control_update_step; 182 | 183 | 184 | // mpc iterates 185 | T *d_lambda, *d_eePos_goal, *d_xu, *d_xu_old; 186 | gpuErrchk(cudaMalloc(&d_lambda, state_size*knot_points*sizeof(T))); 187 | gpuErrchk(cudaMalloc(&d_xu, traj_len*sizeof(T))); 188 | gpuErrchk(cudaMalloc(&d_xu_old, traj_len*sizeof(T))); 189 | gpuErrchk(cudaMalloc(&d_eePos_goal, 6*knot_points*sizeof(T))); 190 | gpuErrchk(cudaMemset(d_lambda, 0, state_size*knot_points*sizeof(T))); 191 | gpuErrchk(cudaMemcpy(d_eePos_goal, d_eePos_traj, 6*knot_points*sizeof(T), cudaMemcpyDeviceToDevice)); 192 | gpuErrchk(cudaMemcpy(d_xu_old, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice)); 193 | gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice)); 194 | 195 | 196 | void *d_dynmem = gato_plant::initializeDynamicsConstMem(); 197 | 198 | 199 | // temp host memory 200 | T h_xs[state_size]; 201 | gpuErrchk(cudaMemcpy(h_xs, d_xs, state_size*sizeof(T), cudaMemcpyDeviceToHost)); 202 | tracking_path.push_back(std::vector(h_xs, &h_xs[state_size])); 203 | gpuErrchk(cudaPeekAtLastError()); 204 | T h_eePos[6]; 205 | T h_eePos_goal[6]; 206 | 207 | 208 | // temp device memory 209 | T *d_eePos; 210 | gpuErrchk(cudaMalloc(&d_eePos, 6*sizeof(T))); 211 | 212 | #if LINSYS_SOLVE == 1 213 | pcg_config config; 214 | config.pcg_block = PCG_NUM_THREADS; 215 | config.pcg_exit_tol = linsys_exit_tol; 216 | config.pcg_max_iter = PCG_MAX_ITER; 217 | #endif 218 | 219 | T rho = 
1e-3; 220 | T rho_reset = 1e-3; 221 | 222 | #if REMOVE_JITTERS 223 | #if LINSYS_SOLVE == 1 224 | config.pcg_exit_tol = 1e-11; 225 | config.pcg_max_iter = 10000; 226 | 227 | for(int j = 0; j < 100; j++){ 228 | sqpSolvePcg(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, config, rho, 1e-3); 229 | gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice)); 230 | } 231 | rho = 1e-3; 232 | config.pcg_exit_tol = linsys_exit_tol; 233 | config.pcg_max_iter = PCG_MAX_ITER; 234 | #else 235 | for(int j = 0; j < 100; j++){ 236 | sqpSolveQdldl(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, rho, 1e-3); 237 | gpuErrchk(cudaMemcpy(d_xu, d_xu_traj, traj_len*sizeof(T), cudaMemcpyDeviceToDevice)); 238 | } 239 | rho = 1e-3; 240 | #endif 241 | 242 | #endif // #if REMOVE_JITTERS 243 | 244 | 245 | 246 | // 247 | // MPC tracking loop 248 | // 249 | for(control_update_step = 0; control_update_step < max_control_updates; control_update_step++){ 250 | 251 | 252 | if (traj_offset == traj_steps){ break; } 253 | 254 | 255 | 256 | #if LIVE_PRINT_PATH 257 | grid::end_effector_positions_kernel<<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel *) d_dynmem, 1); 258 | gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost)); 259 | for (uint32_t i = 0; i < 6; i++){ 260 | std::cout << h_eePos[i] << (i < 5 ? 
" " : "\n"); 261 | } 262 | #endif // #if LIVE_PRINT_PATH 263 | 264 | 265 | 266 | #if LINSYS_SOLVE == 1 267 | sqp_stats = sqpSolvePcg(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, config, rho, rho_reset); 268 | #else 269 | sqp_stats = sqpSolveQdldl(state_size, control_size, knot_points, timestep, d_eePos_goal, d_lambda, d_xu, d_dynmem, rho, rho_reset); 270 | #endif 271 | 272 | cur_linsys_iters = std::get<0>(sqp_stats); 273 | cur_linsys_times = std::get<1>(sqp_stats); 274 | sqp_solve_time_us = std::get<2>(sqp_stats); 275 | cur_sqp_iters = std::get<3>(sqp_stats); 276 | sqp_exits.push_back(std::get<4>(sqp_stats)); 277 | cur_linsys_exits = std::get<5>(sqp_stats); 278 | 279 | 280 | #if CONST_UPDATE_FREQ 281 | simulation_time = SIMULATION_PERIOD; 282 | #else 283 | simulation_time = sqp_solve_time_us; 284 | #endif 285 | 286 | 287 | // simulate traj for current solve time, offset by previous solve time 288 | simple_simulate(state_size, control_size, knot_points, d_xs, d_xu_old, d_dynmem, timestep, prev_simulation_time, simulation_time); 289 | 290 | // old xu = new xu 291 | gpuErrchk(cudaMemcpy(d_xu_old, d_xu, traj_len*sizeof(T), cudaMemcpyDeviceToDevice)); 292 | 293 | 294 | time_since_timestep += simulation_time * 1e-6; 295 | 296 | // if shift_threshold% through timestep 297 | if (!shifted && time_since_timestep > shift_threshold){ 298 | 299 | // record tracking error 300 | grid::end_effector_positions_kernel<<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel *) d_dynmem, 1); 301 | gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost)); 302 | gpuErrchk(cudaMemcpy(h_eePos_goal, d_eePos_goal, 6*sizeof(T), cudaMemcpyDeviceToHost)); 303 | cur_tracking_error = 0.0; 304 | for(uint32_t i=0; i < 3; i++){ 305 | cur_tracking_error += abs(h_eePos[i] - h_eePos_goal[i]); 306 | } 307 | // std::cout << cur_tracking_error << std::endl;; 308 | tracking_errors.push_back(cur_tracking_error); 309 | 310 | 
traj_offset++; 311 | 312 | // shift xu 313 | just_shift(state_size, control_size, knot_points, d_xu); // shift everything over one 314 | if (traj_offset + knot_points < traj_steps){ 315 | // if within precomputed traj, fill in last state, control with precompute 316 | gpuErrchk(cudaMemcpy(&d_xu[traj_len - (state_size + control_size)], &d_xu_traj[(state_size+control_size)*traj_offset - control_size], sizeof(T)*(state_size+control_size), cudaMemcpyDeviceToDevice)); // last state filled from precomputed trajectory 317 | } 318 | else{ 319 | // fill in last state with goal position, zero velocity, last control with zero control 320 | gpuErrchk(cudaMemcpy(&d_xu[traj_len - state_size], &d_xu_traj[(traj_steps-1)*(state_size+control_size)], (state_size/2)*sizeof(T), cudaMemcpyDeviceToDevice)); 321 | gpuErrchk(cudaMemset(&d_xu[traj_len - state_size / 2], 0, (state_size/2) * sizeof(T))); 322 | gpuErrchk(cudaMemset(&d_xu[traj_len - (state_size+control_size)], 0, control_size * sizeof(T))); 323 | } 324 | 325 | // shift goal 326 | just_shift(6, 0, knot_points, d_eePos_goal); 327 | if (traj_offset + knot_points < traj_steps){ 328 | gpuErrchk(cudaMemcpy(&d_eePos_goal[(knot_points-1)*(6)], &d_eePos_traj[(traj_offset+knot_points-1) * (6)], 6*sizeof(T), cudaMemcpyDeviceToDevice)); 329 | } 330 | else{ 331 | // fill in last goal state with goal state and zero velocity 332 | gpuErrchk(cudaMemcpy(&d_eePos_goal[(knot_points-1)*(6)], &d_eePos_traj[(traj_steps-1)*(6)], (6)*sizeof(T), cudaMemcpyDeviceToDevice)); 333 | // gpuErrchk(cudaMemset(&d_eePos_goal[(knot_points-1)*(6) + state_size / 2], 0, (state_size/2) * sizeof(T))); 334 | } 335 | 336 | // shift lambda 337 | just_shift(state_size, 0, knot_points, d_lambda); 338 | // gpuErrchk(cudaMemset(&lambdas[i][state_size*(knot_points-1)], 0, state_size*sizeof(T))); 339 | 340 | shifted = true; 341 | } 342 | 343 | if (time_since_timestep > timestep){ 344 | // std::cout << "shifted to offset: " << traj_offset + 1 << std::endl; 345 | shifted = 
false; 346 | time_since_timestep = std::fmod(time_since_timestep, timestep); 347 | } 348 | gpuErrchk(cudaMemcpy(d_xu, d_xs, state_size*sizeof(T), cudaMemcpyDeviceToDevice)); 349 | 350 | 351 | 352 | prev_simulation_time = simulation_time; 353 | 354 | gpuErrchk(cudaPeekAtLastError()); 355 | 356 | 357 | // record data 358 | linsys_iters.insert(linsys_iters.end(), cur_linsys_iters.begin(), cur_linsys_iters.end()); // linsys iters 359 | linsys_times.insert(linsys_times.end(), cur_linsys_times.begin(), cur_linsys_times.end()); // linsys times 360 | linsys_exits.insert(linsys_exits.end(), cur_linsys_exits.begin(), cur_linsys_exits.end()); 361 | gpuErrchk(cudaMemcpy(h_xs, d_xs, state_size*sizeof(T), cudaMemcpyDeviceToHost)); 362 | tracking_path.push_back(std::vector(h_xs, &h_xs[state_size])); // next state 363 | sqp_times.push_back(sqp_solve_time_us); 364 | sqp_iters.push_back(cur_sqp_iters); 365 | 366 | 367 | #if LIVE_PRINT_STATS 368 | if (control_update_step % 1000 == 50){ 369 | for (uint32_t i = 0; i < state_size; i++){ 370 | std::cout << h_xs[i] << (i < state_size-1 ? 
" " : "\n"); 371 | } 372 | #if TIME_LINSYS == 1 373 | std::cout << "linear system solve time:" << std::endl; 374 | printStats(&linsys_times); 375 | #endif // #if TIME_LINSYS 376 | std::cout << "goal offset [" << traj_offset << "]\n"; 377 | std::cout << "sqp iters" << std::endl; 378 | printStats(&sqp_iters); 379 | std::cout << "sqp times" << std::endl; 380 | printStats(&sqp_times); 381 | 382 | int totalOnes = std::accumulate(linsys_exits.begin(), linsys_exits.end(), 0); 383 | double max_iter_pct = (static_cast(totalOnes) / linsys_exits.size()); 384 | std::cout << "linsys exits for max iter: " << max_iter_pct * 100 << "% of the time\n"; 385 | if (max_iter_pct > 0.5) { 386 | std::cout << "WARNING: PCG exiting for max iter over 50% of the time" << std::endl; 387 | } 388 | 389 | std::cout << "avg tracking error: " << std::accumulate(tracking_errors.begin(), tracking_errors.end(), 0.0f) / traj_offset << " current error: " << cur_tracking_error << "\n"; 390 | std::cout << std::endl; 391 | 392 | } 393 | 394 | #endif 395 | 396 | 397 | } 398 | #if SAVE_DATA 399 | dump_tracking_data(&linsys_iters, &linsys_exits, &linsys_times, &sqp_times, &sqp_iters, &sqp_exits, &tracking_errors, &tracking_path, 400 | traj_offset, control_update_step, start_state_ind, goal_state_ind, test_iter, test_output_prefix); 401 | #endif 402 | 403 | 404 | grid::end_effector_positions_kernel<<<1,128,144*sizeof(T)>>>(d_eePos, d_xs, grid::NUM_JOINTS, (grid::robotModel *) d_dynmem, 1); 405 | gpuErrchk(cudaMemcpy(h_eePos, d_eePos, 6*sizeof(T), cudaMemcpyDeviceToHost)); 406 | gpuErrchk(cudaMemcpy(h_eePos_goal, d_eePos_goal, 6*sizeof(T), cudaMemcpyDeviceToHost)); 407 | cur_tracking_error = 0.0; 408 | for(uint32_t i=0; i < 3; i++){ 409 | cur_tracking_error += abs(h_eePos[i] - h_eePos_goal[i]); 410 | } 411 | 412 | gato_plant::freeDynamicsConstMem(d_dynmem); 413 | 414 | gpuErrchk(cudaFree(d_lambda)); 415 | gpuErrchk(cudaFree(d_xu)); 416 | gpuErrchk(cudaFree(d_eePos_goal)); 417 | gpuErrchk(cudaFree(d_xu_old)); 
418 | 419 | gpuErrchk(cudaFree(d_eePos)); 420 | 421 | #if TIME_LINSYS == 1 422 | return std::make_tuple(linsys_times, tracking_errors, cur_tracking_error); 423 | #else 424 | return std::make_tuple(sqp_iters, tracking_errors, cur_tracking_error); 425 | #endif 426 | } 427 | -------------------------------------------------------------------------------- /include/pcg/linsys_setup.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "gpuassert.cuh" 4 | #include "glass.cuh" 5 | #include "utils/matrix.cuh" 6 | 7 | 8 | 9 | template 10 | __device__ 11 | void complete_SS_Pinv_blockrow(uint32_t state_size, uint32_t knot_points, T *d_S, T *d_Pinv, T *d_gamma, T *s_temp, unsigned blockrow){ 12 | 13 | const uint32_t states_sq = state_size*state_size; 14 | 15 | // STATE OF DEVICE MEM 16 | // S: -Q0_i in spot 00, phik left off-diagonal, thetak main diagonal, phik_T right off-diagonal 17 | // Phi: -Q0 in spot 00, theta_invk main diagonal 18 | // gamma: -Q0_i*q0 spot 0, gammak 19 | 20 | 21 | // GOAL SPACE ALLOCATION IN SHARED MEM 22 | // s_temp = | phi_k_T | phi_k | phi_kp1 | thetaInv_k | thetaInv_kp1 | thetaInv_km1 | PhiInv_R | PhiInv_L | scratch 23 | T *s_phi_k = s_temp; 24 | T *s_phi_kp1_T = s_phi_k + states_sq; 25 | T *s_thetaInv_k = s_phi_kp1_T + states_sq; 26 | T *s_thetaInv_km1 = s_thetaInv_k + states_sq; 27 | T *s_thetaInv_kp1 = s_thetaInv_km1 + states_sq; 28 | T *s_PhiInv_k_R = s_thetaInv_kp1 + states_sq; 29 | T *s_PhiInv_k_L = s_PhiInv_k_R + states_sq; 30 | T *s_scratch = s_PhiInv_k_L + states_sq; 31 | 32 | const unsigned lastrow = knot_points - 1; 33 | 34 | // load phi_kp1_T 35 | if(blockrow!=lastrow){ 36 | load_block_bd( 37 | state_size, knot_points, 38 | d_S, // src 39 | s_phi_kp1_T, // dst 40 | 0, // block column (0, 1, or 2) 41 | blockrow+1, // block row 42 | true // transpose 43 | ); 44 | } 45 | 46 | 47 | // load phi_k 48 | if(blockrow!=0){ 49 | load_block_bd( 50 | state_size, 51 | 
knot_points, 52 | d_S, 53 | s_phi_k, 54 | 0, 55 | blockrow 56 | ); 57 | } 58 | 59 | 60 | 61 | // load thetaInv_k 62 | load_block_bd( 63 | state_size, knot_points, 64 | d_Pinv, 65 | s_thetaInv_k, 66 | 1, 67 | blockrow 68 | ); 69 | 70 | 71 | // load thetaInv_km1 72 | if(blockrow!=0){ 73 | load_block_bd( 74 | state_size, knot_points, 75 | d_Pinv, 76 | s_thetaInv_km1, 77 | 1, 78 | blockrow-1 79 | ); 80 | } 81 | 82 | 83 | // load thetaInv_kp1 84 | if(blockrow!=lastrow){ 85 | load_block_bd( 86 | state_size, knot_points, 87 | d_Pinv, 88 | s_thetaInv_kp1, 89 | 1, 90 | blockrow+1 91 | ); 92 | } 93 | 94 | 95 | __syncthreads();//---------------------------------------------------------------- 96 | 97 | if(blockrow!=0){ 98 | 99 | // compute left off diag 100 | glass::gemm(state_size, state_size, state_size , static_cast(1.0), s_thetaInv_k, s_phi_k, s_scratch); 101 | __syncthreads();//---------------------------------------------------------------- 102 | glass::gemm(state_size, state_size, state_size, static_cast(1.0), s_scratch, s_thetaInv_km1, s_PhiInv_k_L); 103 | __syncthreads();//---------------------------------------------------------------- 104 | 105 | // store left diagonal in Phi 106 | store_block_bd( 107 | state_size, knot_points, 108 | s_PhiInv_k_L, 109 | d_Pinv, 110 | 0, 111 | blockrow, 112 | -1 113 | ); 114 | __syncthreads();//---------------------------------------------------------------- 115 | } 116 | 117 | 118 | if(blockrow!=lastrow){ 119 | 120 | // calculate Phi right diag 121 | glass::gemm(state_size, state_size, state_size , static_cast(1.0), s_thetaInv_k, s_phi_kp1_T, s_scratch); 122 | __syncthreads();//---------------------------------------------------------------- 123 | glass::gemm(state_size, state_size, state_size, static_cast(1.0), s_scratch, s_thetaInv_kp1, s_PhiInv_k_R); 124 | __syncthreads();//---------------------------------------------------------------- 125 | 126 | // store Phi right diag 127 | store_block_bd( 128 | state_size, knot_points, 129 
| s_PhiInv_k_R, 130 | d_Pinv, 131 | 2, 132 | blockrow, 133 | -1 134 | ); 135 | 136 | } 137 | } 138 | 139 | template 140 | __device__ 141 | void form_S_gamma_and_jacobi_Pinv_blockrow(uint32_t state_size, uint32_t control_size, uint32_t knot_points, T *d_G, T *d_C, T *d_g, T *d_c, T *d_S, T *d_Pinv, T *d_gamma, T rho, T *s_temp, unsigned blockrow){ 142 | 143 | // SPACE ALLOCATION IN SHARED MEM 144 | // | phi_k | theta_k | thetaInv_k | gamma_k | block-specific... 145 | // s^2 s^2 s^2 s 146 | T *s_phi_k = s_temp; // phi_k states^2 147 | T *s_theta_k = s_phi_k + state_size*state_size; // theta_k states^2 148 | T *s_thetaInv_k = s_theta_k + state_size*state_size; // thetaInv_k states^2 149 | T *s_gamma_k = s_thetaInv_k + state_size*state_size; // gamma_k states 150 | T *s_end_main = s_gamma_k + state_size; 151 | 152 | if(blockrow==0){ 153 | 154 | // LEADING BLOCK GOAL SHARED MEMORY STATE 155 | // ...gamma_k | . | Q_N_I | q_N | . | Q_0_I | q_0 | scatch 156 | // s^2 s^2 s s^2 s^2 s ? 157 | 158 | T *s_QN = s_end_main; 159 | T *s_QN_i = s_QN + state_size * state_size; 160 | T *s_qN = s_QN_i + state_size * state_size; 161 | T *s_Q0 = s_qN + state_size; 162 | T *s_Q0_i = s_Q0 + state_size * state_size; 163 | T *s_q0 = s_Q0_i + state_size * state_size; 164 | T *s_end = s_q0 + state_size; 165 | 166 | // scratch space 167 | T *s_R_not_needed = s_end; 168 | T *s_r_not_needed = s_R_not_needed + control_size * control_size; 169 | T *s_extra_temp = s_r_not_needed + control_size * control_size; 170 | 171 | __syncthreads();//---------------------------------------------------------------- 172 | 173 | glass::copy(state_size*state_size, d_G, s_Q0); 174 | glass::copy(state_size*state_size, d_G+(knot_points-1)*(state_size*state_size+control_size*control_size), s_QN); 175 | glass::copy(state_size, d_g, s_q0); 176 | glass::copy(state_size, d_g+(knot_points-1)*(state_size+control_size), s_qN); 177 | 178 | __syncthreads();//---------------------------------------------------------------- 179 | 
180 | add_identity(s_Q0, state_size, rho); 181 | add_identity(s_QN, state_size, rho); 182 | // if(PRINT_THREAD){ 183 | // printf("Q0\n"); 184 | // printMat(s_Q0,state_size); 185 | // printf("q0\n"); 186 | // printMat<1,state_size>(s_q0,1); 187 | // printf("QN\n"); 188 | // printMat(s_QN,state_size); 189 | // printf("qN\n"); 190 | // printMat<1,state_size>(s_qN,1); 191 | // printf("start error\n"); 192 | // printMat<1,state_size>(s_integrator_error,1); 193 | // printf("\n"); 194 | // } 195 | __syncthreads();//---------------------------------------------------------------- 196 | 197 | // SHARED MEMORY STATE 198 | // | Q_N | . | q_N | Q_0 | . | q_0 | scatch 199 | 200 | 201 | // save -Q_0 in PhiInv spot 00 202 | store_block_bd( 203 | state_size, 204 | knot_points, 205 | s_Q0, // src 206 | d_Pinv, // dst 207 | 1, // col 208 | blockrow, // blockrow 209 | -1 // multiplier 210 | ); 211 | __syncthreads();//---------------------------------------------------------------- 212 | 213 | 214 | // invert Q_N, Q_0 215 | loadIdentity( state_size,state_size,s_Q0_i, s_QN_i); 216 | __syncthreads();//---------------------------------------------------------------- 217 | invertMatrix( state_size,state_size,state_size,s_Q0, s_QN, s_extra_temp); 218 | 219 | __syncthreads();//---------------------------------------------------------------- 220 | 221 | 222 | // if(PRINT_THREAD){ 223 | // printf("Q0Inv\n"); 224 | // printMat(s_Q0_i,state_size); 225 | // printf("QNInv\n"); 226 | // printMat(s_QN_i,state_size); 227 | // printf("theta\n"); 228 | // printMat(s_theta_k,state_size); 229 | // printf("thetaInv\n"); 230 | // printMat(s_thetaInv_k,state_size); 231 | // printf("\n"); 232 | // } 233 | __syncthreads();//---------------------------------------------------------------- 234 | 235 | // SHARED MEMORY STATE 236 | // | . | Q_N_i | q_N | . 
| Q_0_i | q_0 | scatch 237 | 238 | 239 | // compute gamma 240 | mat_vec_prod( state_size, state_size, 241 | s_Q0_i, 242 | s_q0, 243 | s_gamma_k 244 | ); 245 | __syncthreads();//---------------------------------------------------------------- 246 | 247 | 248 | // save -Q0_i in spot 00 in S 249 | store_block_bd( state_size, knot_points, 250 | s_Q0_i, // src 251 | d_S, // dst 252 | 1, // col 253 | blockrow, // blockrow 254 | -1 // multiplier 255 | ); 256 | __syncthreads();//---------------------------------------------------------------- 257 | 258 | 259 | // compute Q0^{-1}q0 260 | mat_vec_prod( state_size, state_size, 261 | s_Q0_i, 262 | s_q0, 263 | s_Q0 264 | ); 265 | __syncthreads();//---------------------------------------------------------------- 266 | 267 | 268 | // SHARED MEMORY STATE 269 | // | . | Q_N_i | q_N | Q0^{-1}q0 | Q_0_i | q_0 | scatch 270 | 271 | 272 | // save -Q0^{-1}q0 in spot 0 in gamma 273 | for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 274 | d_gamma[ind] = -s_Q0[ind]; 275 | } 276 | __syncthreads();//---------------------------------------------------------------- 277 | 278 | } 279 | else{ // blockrow!=LEAD_BLOCK 280 | 281 | 282 | const unsigned C_set_size = state_size*state_size+state_size*control_size; 283 | const unsigned G_set_size = state_size*state_size+control_size*control_size; 284 | 285 | // NON-LEADING BLOCK GOAL SHARED MEMORY STATE 286 | // ...gamma_k | A_k | B_k | . | Q_k_I | . | Q_k+1_I | . 
| R_k_I | q_k | q_k+1 | r_k | integrator_error | extra_temp 287 | // s^2 s*c s^2 s^2 s^2 s^2 s^2 s^2 s s s s (s_xux,1); 309 | // printf("uk\n"); 310 | // printMat(&s_xux[state_size],1); 311 | // printf("xkp1\n"); 312 | // printMat(&s_xux[state_size+control_size],1); 313 | // printf("\n"); 314 | // } 315 | 316 | __syncthreads();//---------------------------------------------------------------- 317 | 318 | glass::copy(state_size*state_size, d_C+ (blockrow-1)*C_set_size, s_Ak); 319 | glass::copy(state_size*control_size, d_C+ (blockrow-1)*C_set_size+state_size*state_size, s_Bk); 320 | glass::copy(state_size*state_size, d_G+ (blockrow-1)*G_set_size, s_Qk); 321 | glass::copy(state_size*state_size, d_G+ (blockrow*G_set_size), s_Qkp1); 322 | glass::copy(control_size*control_size, d_G+ ((blockrow-1)*G_set_size+state_size*state_size), s_Rk); 323 | glass::copy(state_size, d_g+ (blockrow-1)*(state_size+control_size), s_qk); 324 | glass::copy(state_size, d_g+ (blockrow)*(state_size+control_size), s_qkp1); 325 | glass::copy(control_size, d_g+ ((blockrow-1)*(state_size+control_size)+state_size), s_rk); 326 | 327 | __syncthreads();//---------------------------------------------------------------- 328 | 329 | add_identity(s_Qk, state_size, rho); 330 | add_identity(s_Qkp1, state_size, rho); 331 | add_identity(s_Rk, control_size, rho); 332 | 333 | #if DEBUG_MODE 334 | if(blockIdx.x==1 && threadIdx.x==0){ 335 | printf("Ak\n"); 336 | printMat(s_Ak,state_size); 337 | printf("Bk\n"); 338 | printMat(s_Bk,state_size); 339 | printf("Qk\n"); 340 | printMat(s_Qk,state_size); 341 | printf("Rk\n"); 342 | printMat(s_Rk,control_size); 343 | printf("qk\n"); 344 | printMat(s_qk,1); 345 | printf("rk\n"); 346 | printMat(s_rk,1); 347 | printf("Qkp1\n"); 348 | printMat(s_Qkp1,state_size); 349 | printf("qkp1\n"); 350 | printMat(s_qkp1,1); 351 | printf("integrator error\n"); 352 | } 353 | __syncthreads();//---------------------------------------------------------------- 354 | #endif /* #if DEBUG_MODE */ 
355 | 356 | // Invert Q, Qp1, R 357 | loadIdentity( state_size,state_size,control_size, 358 | s_Qk_i, 359 | s_Qkp1_i, 360 | s_Rk_i 361 | ); 362 | __syncthreads();//---------------------------------------------------------------- 363 | invertMatrix( state_size,state_size,control_size,state_size, 364 | s_Qk, 365 | s_Qkp1, 366 | s_Rk, 367 | s_extra_temp 368 | ); 369 | __syncthreads();//---------------------------------------------------------------- 370 | 371 | // save Qk_i into G (now Ginv) for calculating dz 372 | glass::copy(state_size*state_size, s_Qk_i, d_G+(blockrow-1)*G_set_size); 373 | 374 | // save Rk_i into G (now Ginv) for calculating dz 375 | glass::copy(control_size*control_size, s_Rk_i, d_G+(blockrow-1)*G_set_size+state_size*state_size); 376 | 377 | if(blockrow==knot_points-1){ 378 | // save Qkp1_i into G (now Ginv) for calculating dz 379 | glass::copy(state_size*state_size, s_Qkp1_i, d_G+(blockrow)*G_set_size); 380 | } 381 | __syncthreads();//---------------------------------------------------------------- 382 | 383 | #if DEBUG_MODE 384 | if(blockrow==1&&threadIdx.x==0){ 385 | printf("Qk\n"); 386 | printMat< state_size,state_size>(s_Qk_i,state_size); 387 | printf("RkInv\n"); 388 | printMat(s_Rk_i,control_size); 389 | printf("Qkp1Inv\n"); 390 | printMat< state_size,state_size>(s_Qkp1_i,state_size); 391 | printf("\n"); 392 | } 393 | __syncthreads();//---------------------------------------------------------------- 394 | #endif /* #if DEBUG_MODE */ 395 | 396 | 397 | // Compute -AQ^{-1} in phi 398 | glass::gemm(state_size, state_size, state_size, static_cast(1.0), s_Ak, s_Qk_i, s_phi_k); 399 | // for(int i = threadIdx.x; i < state_size*state_size; i++){ 400 | // s_phi_k[i] *= -1; 401 | // } 402 | 403 | __syncthreads();//---------------------------------------------------------------- 404 | 405 | // Compute -BR^{-1} in Qkp1 406 | glass::gemm(state_size, control_size, control_size, static_cast(1.0), s_Bk, s_Rk_i, s_Qkp1); 407 | 408 | 
__syncthreads();//---------------------------------------------------------------- 409 | 410 | // compute Q_{k+1}^{-1}q_{k+1} - IntegratorError in gamma 411 | mat_vec_prod( state_size, state_size, 412 | s_Qkp1_i, 413 | s_qkp1, 414 | s_gamma_k 415 | ); 416 | for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){ 417 | s_gamma_k[i] -= d_c[(blockrow*state_size)+i]; 418 | } 419 | __syncthreads();//---------------------------------------------------------------- 420 | 421 | // compute -AQ^{-1}q for gamma temp storage in extra temp 422 | mat_vec_prod( state_size, state_size, 423 | s_phi_k, 424 | s_qk, 425 | s_extra_temp 426 | ); 427 | 428 | 429 | __syncthreads();//---------------------------------------------------------------- 430 | 431 | // compute -BR^{-1}r for gamma temp storage in extra temp + states 432 | mat_vec_prod( state_size, control_size, 433 | s_Qkp1, 434 | s_rk, 435 | s_extra_temp + state_size 436 | ); 437 | 438 | __syncthreads();//---------------------------------------------------------------- 439 | 440 | // gamma = yeah... 
441 | for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){ 442 | s_gamma_k[i] += s_extra_temp[state_size + i] + s_extra_temp[i]; 443 | } 444 | __syncthreads();//---------------------------------------------------------------- 445 | 446 | // compute AQ^{-1}AT - Qkp1^{-1} for theta 447 | glass::gemm( 448 | state_size, 449 | state_size, 450 | state_size, 451 | static_cast(1.0), 452 | s_phi_k, 453 | s_Ak, 454 | s_theta_k 455 | ); 456 | 457 | __syncthreads();//---------------------------------------------------------------- 458 | 459 | #if DEBUG_MODE 460 | if(blockrow==1&&threadIdx.x==0){ 461 | printf("this is the A thing\n"); 462 | printMat< state_size, state_size>(s_theta_k, 234); 463 | } 464 | #endif /* #if DEBUG_MODE */ 465 | 466 | for(unsigned i = threadIdx.x; i < state_size*state_size; i += blockDim.x){ 467 | s_theta_k[i] += s_Qkp1_i[i]; 468 | } 469 | 470 | __syncthreads();//---------------------------------------------------------------- 471 | 472 | // compute BR^{-1}BT for theta temp storage in QKp1{-1} 473 | glass::gemm( 474 | state_size, 475 | control_size, 476 | state_size, 477 | static_cast(1.0), 478 | s_Qkp1, 479 | s_Bk, 480 | s_Qkp1_i 481 | ); 482 | 483 | __syncthreads();//---------------------------------------------------------------- 484 | 485 | for(unsigned i = threadIdx.x; i < state_size*state_size; i += blockDim.x){ 486 | s_theta_k[i] += s_Qkp1_i[i]; 487 | } 488 | __syncthreads();//---------------------------------------------------------------- 489 | 490 | // save phi_k into left off-diagonal of S, 491 | store_block_bd( state_size, knot_points, 492 | s_phi_k, // src 493 | d_S, // dst 494 | 0, // col 495 | blockrow, // blockrow 496 | -1 497 | ); 498 | __syncthreads();//---------------------------------------------------------------- 499 | 500 | // save -s_theta_k main diagonal S 501 | store_block_bd( state_size, knot_points, 502 | s_theta_k, 503 | d_S, 504 | 1, 505 | blockrow, 506 | -1 507 | ); 508 | 
__syncthreads();//---------------------------------------------------------------- 509 | 510 | // invert theta 511 | loadIdentity(state_size,s_thetaInv_k); 512 | __syncthreads();//---------------------------------------------------------------- 513 | invertMatrix(state_size,s_theta_k, s_extra_temp); 514 | __syncthreads();//---------------------------------------------------------------- 515 | 516 | 517 | // save thetaInv_k main diagonal PhiInv 518 | store_block_bd( state_size, knot_points, 519 | s_thetaInv_k, 520 | d_Pinv, 521 | 1, 522 | blockrow, 523 | -1 524 | ); 525 | 526 | __syncthreads();//---------------------------------------------------------------- 527 | 528 | // save gamma_k in gamma 529 | for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 530 | unsigned offset = (blockrow)*state_size + ind; 531 | d_gamma[offset] = s_gamma_k[ind]*-1; 532 | } 533 | 534 | __syncthreads();//---------------------------------------------------------------- 535 | 536 | //transpose phi_k 537 | loadIdentity(state_size,s_Ak); 538 | __syncthreads();//---------------------------------------------------------------- 539 | glass::gemm( 540 | state_size, 541 | state_size, 542 | state_size, 543 | static_cast(1.0), 544 | s_Ak, 545 | s_phi_k, 546 | s_Qkp1 547 | ); 548 | __syncthreads();//---------------------------------------------------------------- 549 | 550 | // save phi_k_T into right off-diagonal of S, 551 | store_block_bd( state_size, knot_points, 552 | s_Qkp1, // src 553 | d_S, // dst 554 | 2, // col 555 | blockrow-1, // blockrow 556 | -1 557 | ); 558 | 559 | __syncthreads();//---------------------------------------------------------------- 560 | } 561 | 562 | } 563 | 564 | 565 | template 566 | __global__ 567 | void form_S_gamma_Pinv_kernel( 568 | uint32_t state_size, 569 | uint32_t control_size, 570 | uint32_t knot_points, 571 | T *d_G, 572 | T *d_C, 573 | T *d_g, 574 | T *d_c, 575 | T *d_S, 576 | T *d_Pinv, 577 | T *d_gamma, 578 | T rho 579 | ){ 580 | 581 | 
extern __shared__ T s_temp[ ]; 582 | 583 | for(unsigned blockrow=blockIdx.x; blockrow( 585 | state_size, 586 | control_size, 587 | knot_points, 588 | d_G, 589 | d_C, 590 | d_g, 591 | d_c, 592 | d_S, 593 | d_Pinv, 594 | d_gamma, 595 | rho, 596 | s_temp, 597 | blockrow 598 | ); 599 | } 600 | cgrps::this_grid().sync(); 601 | 602 | for(unsigned blockrow=blockIdx.x; blockrow( 604 | state_size, knot_points, 605 | d_S, 606 | d_Pinv, 607 | d_gamma, 608 | s_temp, 609 | blockrow 610 | ); 611 | } 612 | } 613 | 614 | 615 | /******************************************************************************* 616 | * Interface Functions * 617 | *******************************************************************************/ 618 | 619 | 620 | template 621 | void form_schur_system( 622 | uint32_t state_size, 623 | uint32_t control_size, 624 | uint32_t knot_points, 625 | T *d_G_dense, 626 | T *d_C_dense, 627 | T *d_g, 628 | T *d_c, 629 | T *d_S, 630 | T *d_Pinv, 631 | T *d_gamma, 632 | T rho 633 | ){ 634 | const uint32_t s_temp_size = sizeof(T)*(8 * state_size*state_size + 635 | 7 * state_size + 636 | state_size * control_size + 637 | 3 * control_size + 638 | 2 * control_size * control_size + 639 | 3); 640 | 641 | void *kernel = (void *) form_S_gamma_Pinv_kernel; 642 | void *args[] = { 643 | (void *) &state_size, 644 | (void *) &control_size, 645 | (void *) &knot_points, 646 | (void *) &d_G_dense, 647 | (void *) &d_C_dense, 648 | (void *) &d_g, 649 | (void *) &d_c, 650 | (void *) &d_S, 651 | (void *) &d_Pinv, 652 | (void *) &d_gamma, 653 | (void *) &rho 654 | }; 655 | 656 | gpuErrchk(cudaLaunchCooperativeKernel(kernel, knot_points, SCHUR_THREADS, args, s_temp_size)); 657 | } -------------------------------------------------------------------------------- /include/pcg/sqp.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 
#include 11 | #include 12 | #include 13 | #include 14 | #include "linsys_setup.cuh" 15 | #include "common/kkt.cuh" 16 | #include "common/dz.cuh" 17 | #include "merit.cuh" 18 | #include "gpu_pcg.cuh" 19 | #include "settings.cuh" 20 | 21 | template 22 | auto sqpSolvePcg(const uint32_t state_size, const uint32_t control_size, const uint32_t knot_points, float timestep, T *d_eePos_traj, T *d_lambda, T *d_xu, void *d_dynMem_const, pcg_config& config, T &rho, T rho_reset){ 23 | 24 | // data storage 25 | std::vector pcg_iter_vec; 26 | std::vector pcg_exit_vec; 27 | std::vector linsys_time_vec; 28 | bool sqp_time_exit = 1; // for data recording, not a flag 29 | 30 | 31 | 32 | // sqp timing 33 | struct timespec sqp_solve_start, sqp_solve_end; 34 | gpuErrchk(cudaDeviceSynchronize()); 35 | clock_gettime(CLOCK_MONOTONIC, &sqp_solve_start); 36 | 37 | 38 | 39 | const uint32_t states_sq = state_size*state_size; 40 | const uint32_t states_p_controls = state_size * control_size; 41 | const uint32_t controls_sq = control_size * control_size; 42 | const uint32_t states_s_controls = state_size + control_size; 43 | const uint32_t KKT_G_DENSE_SIZE_BYTES = static_cast(((states_sq+controls_sq)*knot_points-controls_sq)*sizeof(T)); 44 | const uint32_t KKT_C_DENSE_SIZE_BYTES = static_cast((states_sq+states_p_controls)*(knot_points-1)*sizeof(T)); 45 | const uint32_t KKT_g_SIZE_BYTES = static_cast(((state_size+control_size)*knot_points-control_size)*sizeof(T)); 46 | const uint32_t KKT_c_SIZE_BYTES = static_cast((state_size*knot_points)*sizeof(T)); 47 | const uint32_t DZ_SIZE_BYTES = static_cast((states_s_controls*knot_points-control_size)*sizeof(T)); 48 | 49 | 50 | // line search things 51 | const float mu = 10.0f; 52 | const uint32_t num_alphas = 8; 53 | T h_merit_news[num_alphas]; 54 | void *ls_merit_kernel = (void *) ls_gato_compute_merit; 55 | const size_t merit_smem_size = get_merit_smem_size(state_size, control_size); 56 | T h_merit_initial, min_merit; 57 | T alphafinal; 58 | T 
delta_merit_iter = 0; 59 | T delta_merit_total = 0; 60 | uint32_t line_search_step = 0; 61 | 62 | 63 | // streams n cublas init 64 | cudaStream_t streams[num_alphas]; 65 | for(uint32_t str = 0; str < num_alphas; str++){ 66 | cudaStreamCreate(&streams[str]); 67 | } 68 | gpuErrchk(cudaPeekAtLastError()); 69 | 70 | cublasHandle_t handle; 71 | if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); exit(13); } 72 | gpuErrchk(cudaPeekAtLastError()); 73 | 74 | 75 | uint32_t sqp_iter = 0; 76 | 77 | 78 | 79 | T *d_merit_initial, *d_merit_news, *d_merit_temp, 80 | *d_G_dense, *d_C_dense, *d_g, *d_c, *d_Ginv_dense, 81 | *d_S, *d_gamma, 82 | *d_dz, 83 | *d_xs; 84 | 85 | 86 | T drho = 1.0; 87 | T rho_factor = RHO_FACTOR; 88 | T rho_max = RHO_MAX; 89 | T rho_min = RHO_MIN; 90 | 91 | 92 | 93 | 94 | gpuErrchk(cudaMalloc(&d_G_dense, KKT_G_DENSE_SIZE_BYTES)); 95 | gpuErrchk(cudaMalloc(&d_C_dense, KKT_C_DENSE_SIZE_BYTES)); 96 | gpuErrchk(cudaMalloc(&d_g, KKT_g_SIZE_BYTES)); 97 | gpuErrchk(cudaMalloc(&d_c, KKT_c_SIZE_BYTES)); 98 | d_Ginv_dense = d_G_dense; 99 | 100 | gpuErrchk(cudaMalloc(&d_S, 3*states_sq*knot_points*sizeof(T))); 101 | gpuErrchk(cudaMalloc(&d_gamma, state_size*knot_points*sizeof(T))); 102 | gpuErrchk(cudaPeekAtLastError()); 103 | 104 | 105 | gpuErrchk(cudaMalloc(&d_dz, DZ_SIZE_BYTES)); 106 | gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(T))); 107 | gpuErrchk(cudaMemcpy(d_xs, d_xu, state_size*sizeof(T), cudaMemcpyDeviceToDevice)); 108 | gpuErrchk(cudaMalloc(&d_merit_news, 8*sizeof(T))); 109 | gpuErrchk(cudaMalloc(&d_merit_temp, 8*knot_points*sizeof(T))); 110 | // pcg iterates 111 | 112 | gpuErrchk(cudaMalloc(&d_merit_initial, sizeof(T))); 113 | gpuErrchk(cudaMemset(d_merit_initial, 0, sizeof(T))); 114 | 115 | 116 | // pcg things 117 | T *d_Pinv; 118 | gpuErrchk(cudaMalloc(&d_Pinv, 3*states_sq*knot_points*sizeof(T))); 119 | 120 | /* PCG vars */ 121 | T *d_r, *d_p, *d_v_temp, *d_eta_new_temp;// *d_r_tilde, *d_upsilon; 122 | 
gpuErrchk(cudaMalloc(&d_r, state_size*knot_points*sizeof(T))); 123 | gpuErrchk(cudaMalloc(&d_p, state_size*knot_points*sizeof(T))); 124 | gpuErrchk(cudaMalloc(&d_v_temp, knot_points*sizeof(T))); 125 | gpuErrchk(cudaMalloc(&d_eta_new_temp, knot_points*sizeof(T))); 126 | 127 | 128 | 129 | void *pcg_kernel = (void *) pcg; 130 | uint32_t pcg_iters; 131 | uint32_t *d_pcg_iters; 132 | gpuErrchk(cudaMalloc(&d_pcg_iters, sizeof(uint32_t))); 133 | bool pcg_exit; 134 | bool *d_pcg_exit; 135 | gpuErrchk(cudaMalloc(&d_pcg_exit, sizeof(bool))); 136 | 137 | void *pcgKernelArgs[] = { 138 | (void *)&d_S, 139 | (void *)&d_Pinv, 140 | (void *)&d_gamma, 141 | (void *)&d_lambda, 142 | (void *)&d_r, 143 | (void *)&d_p, 144 | (void *)&d_v_temp, 145 | (void *)&d_eta_new_temp, 146 | (void *)&d_pcg_iters, 147 | (void *)&d_pcg_exit, 148 | (void *)&config.pcg_max_iter, 149 | (void *)&config.pcg_exit_tol 150 | }; 151 | size_t ppcg_kernel_smem_size = pcgSharedMemSize(state_size, knot_points); 152 | 153 | 154 | gpuErrchk(cudaPeekAtLastError()); 155 | gpuErrchk(cudaDeviceSynchronize()); 156 | 157 | #if TIME_LINSYS 158 | struct timespec linsys_start, linsys_end; 159 | double linsys_time; 160 | #endif 161 | #if CONST_UPDATE_FREQ 162 | struct timespec sqp_cur; 163 | auto sqpTimecheck = [&]() { 164 | clock_gettime(CLOCK_MONOTONIC, &sqp_cur); 165 | return time_delta_us_timespec(sqp_solve_start,sqp_cur) > SQP_MAX_TIME_US; 166 | }; 167 | #else 168 | auto sqpTimecheck = [&]() { return false; }; 169 | #endif 170 | 171 | 172 | ///TODO: atomic race conditions here aren't fixed but don't seem to be problematic 173 | compute_merit<<>>( 174 | state_size, control_size, knot_points, 175 | d_xu, 176 | d_eePos_traj, 177 | static_cast(10), 178 | timestep, 179 | d_dynMem_const, 180 | d_merit_initial 181 | ); 182 | gpuErrchk(cudaMemcpyAsync(&h_merit_initial, d_merit_initial, sizeof(T), cudaMemcpyDeviceToHost)); 183 | gpuErrchk(cudaPeekAtLastError()); 184 | 185 | // 186 | // SQP LOOP 187 | // 188 | for(uint32_t 
sqpiter = 0; sqpiter < SQP_MAX_ITER; sqpiter++){ 189 | 190 | generate_kkt_submatrices<<(state_size, control_size)>>>( 191 | state_size, 192 | control_size, 193 | knot_points, 194 | d_G_dense, 195 | d_C_dense, 196 | d_g, 197 | d_c, 198 | d_dynMem_const, 199 | timestep, 200 | d_eePos_traj, 201 | d_xs, 202 | d_xu 203 | ); 204 | gpuErrchk(cudaPeekAtLastError()); 205 | if (sqpTimecheck()){ break; } 206 | 207 | form_schur_system( 208 | state_size, 209 | control_size, 210 | knot_points, 211 | d_G_dense, 212 | d_C_dense, 213 | d_g, 214 | d_c, 215 | d_S, 216 | d_Pinv, 217 | d_gamma, 218 | rho 219 | ); 220 | gpuErrchk(cudaPeekAtLastError()); 221 | if (sqpTimecheck()){ break; } 222 | 223 | 224 | #if TIME_LINSYS 225 | gpuErrchk(cudaDeviceSynchronize()); 226 | if (sqpTimecheck()){ break; } 227 | clock_gettime(CLOCK_MONOTONIC,&linsys_start); 228 | #endif // #if TIME_LINSYS 229 | 230 | gpuErrchk(cudaLaunchCooperativeKernel(pcg_kernel, knot_points, PCG_NUM_THREADS, pcgKernelArgs, ppcg_kernel_smem_size)); 231 | gpuErrchk(cudaMemcpy(&pcg_iters, d_pcg_iters, sizeof(uint32_t), cudaMemcpyDeviceToHost)); 232 | gpuErrchk(cudaMemcpy(&pcg_exit, d_pcg_exit, sizeof(bool), cudaMemcpyDeviceToHost)); 233 | gpuErrchk(cudaPeekAtLastError()); 234 | 235 | #if TIME_LINSYS 236 | gpuErrchk(cudaDeviceSynchronize()); 237 | clock_gettime(CLOCK_MONOTONIC,&linsys_end); 238 | 239 | linsys_time = time_delta_us_timespec(linsys_start,linsys_end); 240 | linsys_time_vec.push_back(linsys_time); 241 | #endif // #if TIME_LINSYS 242 | 243 | pcg_iter_vec.push_back(pcg_iters); 244 | pcg_exit_vec.push_back(pcg_exit); 245 | 246 | 247 | if (sqpTimecheck()){ break; } 248 | 249 | // recover dz 250 | compute_dz( 251 | state_size, 252 | control_size, 253 | knot_points, 254 | d_Ginv_dense, 255 | d_C_dense, 256 | d_g, 257 | d_lambda, 258 | d_dz 259 | ); 260 | gpuErrchk(cudaPeekAtLastError()); 261 | if (sqpTimecheck()){ break; } 262 | 263 | 264 | // line search 265 | for(uint32_t p = 0; p < num_alphas; p++){ 266 | void 
*kernelArgs[] = { 267 | (void *)&state_size, 268 | (void *)&control_size, 269 | (void *)&knot_points, 270 | (void *)&d_xs, 271 | (void *)&d_xu, 272 | (void *)&d_eePos_traj, 273 | (void *)&mu, 274 | (void *)&timestep, 275 | (void *)&d_dynMem_const, 276 | (void *)&d_dz, 277 | (void *)&p, 278 | (void *)&d_merit_news, 279 | (void *)&d_merit_temp 280 | }; 281 | gpuErrchk(cudaLaunchCooperativeKernel(ls_merit_kernel, knot_points, MERIT_THREADS, kernelArgs, get_merit_smem_size(state_size, knot_points), streams[p])); 282 | } 283 | if (sqpTimecheck()){ break; } 284 | gpuErrchk(cudaPeekAtLastError()); 285 | gpuErrchk(cudaDeviceSynchronize()); 286 | 287 | 288 | cudaMemcpy(h_merit_news, d_merit_news, 8*sizeof(T), cudaMemcpyDeviceToHost); 289 | if (sqpTimecheck()){ break; } 290 | 291 | 292 | line_search_step = 0; 293 | min_merit = h_merit_initial; 294 | for(int i = 0; i < 8; i++){ 295 | // std::cout << h_merit_news[i] << (i == 7 ? "\n" : " "); 296 | ///TODO: reduction ratio 297 | if(h_merit_news[i] < min_merit){ 298 | min_merit = h_merit_news[i]; 299 | line_search_step = i; 300 | } 301 | } 302 | 303 | 304 | if(min_merit == h_merit_initial){ 305 | // line search failure 306 | drho = max(drho*rho_factor, rho_factor); 307 | rho = max(rho*drho, rho_min); 308 | sqp_iter++; 309 | if(rho > rho_max){ 310 | sqp_time_exit = 0; 311 | rho = rho_reset; 312 | break; 313 | } 314 | continue; 315 | } 316 | // std::cout << "line search accepted\n"; 317 | alphafinal = -1.0 / (1 << line_search_step); // alpha sign 318 | 319 | drho = min(drho/rho_factor, 1/rho_factor); 320 | rho = max(rho*drho, rho_min); 321 | 322 | 323 | #if USE_DOUBLES 324 | cublasDaxpy( 325 | handle, 326 | DZ_SIZE_BYTES / sizeof(T), 327 | &alphafinal, 328 | d_dz, 1, 329 | d_xu, 1 330 | ); 331 | #else 332 | cublasSaxpy( 333 | handle, 334 | DZ_SIZE_BYTES / sizeof(T), 335 | &alphafinal, 336 | d_dz, 1, 337 | d_xu, 1 338 | ); 339 | #endif 340 | 341 | gpuErrchk(cudaPeekAtLastError()); 342 | // if success increment after update 343 |
sqp_iter++; 344 | 345 | if (sqpTimecheck()){ break; } 346 | 347 | 348 | delta_merit_iter = h_merit_initial - min_merit; 349 | delta_merit_total += delta_merit_iter; 350 | 351 | 352 | h_merit_initial = min_merit; 353 | 354 | } 355 | 356 | gpuErrchk(cudaPeekAtLastError()); 357 | gpuErrchk(cudaDeviceSynchronize()); 358 | clock_gettime(CLOCK_MONOTONIC, &sqp_solve_end); 359 | 360 | cublasDestroy(handle); 361 | 362 | for(uint32_t st=0; st < num_alphas; st++){ 363 | gpuErrchk(cudaStreamDestroy(streams[st])); 364 | } 365 | 366 | 367 | 368 | 369 | gpuErrchk(cudaFree(d_merit_initial)); 370 | gpuErrchk(cudaFree(d_merit_news)); 371 | gpuErrchk(cudaFree(d_merit_temp)); 372 | gpuErrchk(cudaFree(d_G_dense)); 373 | gpuErrchk(cudaFree(d_C_dense)); 374 | gpuErrchk(cudaFree(d_g)); 375 | gpuErrchk(cudaFree(d_c)); 376 | gpuErrchk(cudaFree(d_S)); 377 | gpuErrchk(cudaFree(d_gamma)); 378 | gpuErrchk(cudaFree(d_dz)); 379 | gpuErrchk(cudaFree(d_xs)); 380 | gpuErrchk(cudaFree(d_pcg_iters)); 381 | gpuErrchk(cudaFree(d_pcg_exit)); 382 | gpuErrchk(cudaFree(d_Pinv)); 383 | gpuErrchk(cudaFree(d_r)); 384 | gpuErrchk(cudaFree(d_p)); 385 | gpuErrchk(cudaFree(d_v_temp)); 386 | gpuErrchk(cudaFree(d_eta_new_temp)); 387 | 388 | 389 | 390 | double sqp_solve_time = time_delta_us_timespec(sqp_solve_start, sqp_solve_end); 391 | 392 | return std::make_tuple(pcg_iter_vec, linsys_time_vec, sqp_solve_time, sqp_iter, sqp_time_exit, pcg_exit_vec); 393 | } 394 | -------------------------------------------------------------------------------- /include/qdldl/linsys_setup.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "gpuassert.cuh" 4 | #include "glass.cuh" 5 | #include "dynamics/rbd_plant.cuh" 6 | #include "merit.cuh" 7 | #include "utils/matrix.cuh" 8 | #include "utils/csr.cuh" 9 | #include "integrator.cuh" 10 | #include "qdldl.h" 11 | 12 | template 13 | __global__ 14 | void form_schur_qdl_kernel(uint32_t state_size, 15 | uint32_t 
control_size, 16 | uint32_t knot_points, 17 | T *d_G, 18 | T *d_C, 19 | T *d_g, 20 | T *d_c, 21 | QDLDL_float *d_val, 22 | T *d_gamma, 23 | T rho) 24 | { 25 | 26 | 27 | 28 | extern __shared__ T s_temp[ ]; 29 | const uint32_t states_sq = state_size*state_size; 30 | const uint32_t states_p_controls = state_size * control_size; 31 | const uint32_t controls_sq = control_size * control_size; 32 | const uint32_t states_s_controls = state_size + control_size; 33 | 34 | 35 | for(unsigned blockrow=blockIdx.x; blockrow(s_Q0, state_size, rho); 75 | add_identity(s_QN, state_size, rho); 76 | 77 | __syncthreads();//---------------------------------------------------------------- 78 | 79 | // SHARED MEMORY STATE 80 | // | Q_N | . | q_N | Q_0 | . | q_0 | scatch 81 | 82 | __syncthreads();//---------------------------------------------------------------- 83 | 84 | 85 | // invert Q_N, Q_0 86 | loadIdentity( state_size,state_size,s_Q0_i, s_QN_i); 87 | __syncthreads();//---------------------------------------------------------------- 88 | invertMatrix( state_size,state_size,state_size,s_Q0, s_QN, s_extra_temp); 89 | 90 | __syncthreads();//---------------------------------------------------------------- 91 | 92 | 93 | // SHARED MEMORY STATE 94 | // | . | Q_N_i | q_N | . 
| Q_0_i | q_0 | scatch 95 | 96 | 97 | // compute gamma 98 | mat_vec_prod( state_size, state_size, 99 | s_Q0_i, 100 | s_q0, 101 | s_gamma_k 102 | ); 103 | __syncthreads();//---------------------------------------------------------------- 104 | 105 | // save -Q0_i in spot 00 in S 106 | store_block_csr_lowertri(state_size, knot_points, s_Q0_i, d_val, 1, blockrow, -1); 107 | 108 | __syncthreads();//---------------------------------------------------------------- 109 | 110 | 111 | // compute Q0^{-1}q0 112 | mat_vec_prod( state_size, state_size, 113 | s_Q0_i, 114 | s_q0, 115 | s_Q0 116 | ); 117 | __syncthreads();//---------------------------------------------------------------- 118 | 119 | 120 | // SHARED MEMORY STATE 121 | // | . | Q_N_i | q_N | Q0^{-1}q0 | Q_0_i | q_0 | scatch 122 | 123 | 124 | // save -Q0^{-1}q0 in spot 0 in gamma 125 | for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 126 | d_gamma[ind] = -s_Q0[ind]; 127 | } 128 | __syncthreads();//---------------------------------------------------------------- 129 | 130 | } 131 | else{ // blockrow!=LEAD_BLOCK 132 | 133 | 134 | const unsigned C_set_size = states_sq+states_p_controls; 135 | const unsigned G_set_size = states_sq+controls_sq; 136 | 137 | // NON-LEADING BLOCK GOAL SHARED MEMORY STATE 138 | // ...gamma_k | A_k | B_k | . | Q_k_I | . | Q_k+1_I | . 
| R_k_I | q_k | q_k+1 | r_k | integrator_error | extra_temp 139 | // s^2 s*c s^2 s^2 s^2 s^2 s^2 s^2 s s s s (s_Qk, state_size, rho); 172 | add_identity(s_Qkp1, state_size, rho); 173 | add_identity(s_Rk, control_size, rho); 174 | 175 | // Invert Q, Qp1, R 176 | loadIdentity( state_size,state_size,control_size, 177 | s_Qk_i, 178 | s_Qkp1_i, 179 | s_Rk_i 180 | ); 181 | __syncthreads();//---------------------------------------------------------------- 182 | invertMatrix( state_size,state_size,control_size,state_size, 183 | s_Qk, 184 | s_Qkp1, 185 | s_Rk, 186 | s_extra_temp 187 | ); 188 | __syncthreads();//---------------------------------------------------------------- 189 | 190 | // save Qk_i into G (now Ginv) for calculating dz 191 | gato_memcpy( 192 | d_G+(blockrow-1)*G_set_size, 193 | s_Qk_i, 194 | states_sq 195 | ); 196 | 197 | // save Rk_i into G (now Ginv) for calculating dz 198 | gato_memcpy( 199 | d_G+(blockrow-1)*G_set_size+states_sq, 200 | s_Rk_i, 201 | controls_sq 202 | ); 203 | 204 | if(blockrow==knot_points-1){ 205 | // save Qkp1_i into G (now Ginv) for calculating dz 206 | gato_memcpy( 207 | d_G+(blockrow)*G_set_size, 208 | s_Qkp1_i, 209 | states_sq 210 | ); 211 | } 212 | __syncthreads();//---------------------------------------------------------------- 213 | 214 | // Compute -AQ^{-1} in phi 215 | glass::gemm( 216 | state_size, 217 | state_size, 218 | state_size, 219 | static_cast(1.0), 220 | s_Ak, 221 | s_Qk_i, 222 | s_phi_k 223 | ); 224 | 225 | __syncthreads();//---------------------------------------------------------------- 226 | 227 | // Compute -BR^{-1} in Qkp1 228 | glass::gemm( 229 | state_size, 230 | control_size, 231 | control_size, 232 | static_cast(1.0), 233 | s_Bk, 234 | s_Rk_i, 235 | s_Qkp1 236 | ); 237 | 238 | __syncthreads();//---------------------------------------------------------------- 239 | 240 | // compute Q_{k+1}^{-1}q_{k+1} - IntegratorError in gamma 241 | mat_vec_prod( state_size, state_size, 242 | s_Qkp1_i, 243 | s_qkp1, 244 | 
s_gamma_k 245 | ); 246 | for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){ 247 | s_gamma_k[i] -= d_c[(blockrow*state_size)+i]; 248 | } 249 | __syncthreads();//---------------------------------------------------------------- 250 | 251 | // compute -AQ^{-1}q for gamma temp storage in extra temp 252 | mat_vec_prod( state_size, state_size, 253 | s_phi_k, 254 | s_qk, 255 | s_extra_temp 256 | ); 257 | 258 | 259 | __syncthreads();//---------------------------------------------------------------- 260 | 261 | // compute -BR^{-1}r for gamma temp storage in extra temp + states 262 | mat_vec_prod( state_size, control_size, 263 | s_Qkp1, 264 | s_rk, 265 | s_extra_temp + state_size 266 | ); 267 | 268 | __syncthreads();//---------------------------------------------------------------- 269 | 270 | // gamma = yeah... 271 | for(unsigned i = threadIdx.x; i < state_size; i += blockDim.x){ 272 | s_gamma_k[i] += s_extra_temp[state_size + i] + s_extra_temp[i]; 273 | } 274 | __syncthreads();//---------------------------------------------------------------- 275 | 276 | // compute AQ^{-1}AT - Qkp1^{-1} for theta 277 | glass::gemm( 278 | state_size, 279 | state_size, 280 | state_size, 281 | static_cast(1.0), 282 | s_phi_k, 283 | s_Ak, 284 | s_theta_k 285 | ); 286 | 287 | __syncthreads();//---------------------------------------------------------------- 288 | 289 | 290 | for(unsigned i = threadIdx.x; i < states_sq; i += blockDim.x){ 291 | s_theta_k[i] += s_Qkp1_i[i]; 292 | } 293 | 294 | __syncthreads();//---------------------------------------------------------------- 295 | 296 | // compute BR^{-1}BT for theta temp storage in QKp1{-1} 297 | glass::gemm( 298 | state_size, 299 | control_size, 300 | state_size, 301 | static_cast(1.0), 302 | s_Qkp1, 303 | s_Bk, 304 | s_Qkp1_i 305 | ); 306 | 307 | __syncthreads();//---------------------------------------------------------------- 308 | 309 | for(unsigned i = threadIdx.x; i < states_sq; i += blockDim.x){ 310 | s_theta_k[i] += 
s_Qkp1_i[i]; 311 | } 312 | __syncthreads();//---------------------------------------------------------------- 313 | 314 | // // save phi_k into left off-diagonal of S, 315 | store_block_csr_lowertri(state_size, knot_points, s_phi_k, d_val, 0, blockrow, -1); 316 | 317 | __syncthreads();//---------------------------------------------------------------- 318 | 319 | 320 | // save -s_theta_k main diagonal S 321 | store_block_csr_lowertri(state_size, knot_points, s_theta_k, d_val, 1, blockrow, -1); 322 | 323 | __syncthreads();//---------------------------------------------------------------- 324 | 325 | // save gamma_k in gamma 326 | for(unsigned ind = threadIdx.x; ind < state_size; ind += blockDim.x){ 327 | unsigned offset = (blockrow)*state_size + ind; 328 | d_gamma[offset] = s_gamma_k[ind]*-1; 329 | } 330 | 331 | __syncthreads();//---------------------------------------------------------------- 332 | 333 | } 334 | 335 | } 336 | } 337 | 338 | template 339 | void form_schur_system_qdldl(uint32_t state_size, uint32_t control_size, uint32_t knot_points, 340 | T *d_G_dense, T *d_C_dense, T *d_g, T *d_c, 341 | QDLDL_float *d_val, T *d_gamma, 342 | T rho) 343 | { 344 | const uint32_t s_temp_size =sizeof(T)*(8 * state_size*state_size+ 345 | 7 * state_size+ 346 | state_size * control_size+ 347 | 3 * control_size + 2 * control_size * control_size + 3); 348 | 349 | // form Schur, Pinv 350 | form_schur_qdl_kernel<<>>(state_size, control_size, knot_points, d_G_dense, d_C_dense, d_g, d_c, d_val, d_gamma, rho); 351 | 352 | } -------------------------------------------------------------------------------- /include/qdldl/sqp.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "qdldl.h" 15 | #include "qdldl/linsys_setup.cuh" 16 | #include "merit.cuh" 17 | #include 
"settings.cuh" 18 | #include "kkt.cuh" 19 | #include "dz.cuh" 20 | 21 | 22 | __host__ 23 | void qdldl_solve_schur(const QDLDL_int An, 24 | QDLDL_int *h_col_ptr, QDLDL_int *h_row_ind, QDLDL_float *Ax, QDLDL_float *b, 25 | QDLDL_float *h_lambda, 26 | QDLDL_int *Lp, QDLDL_int *Li, QDLDL_float *Lx, QDLDL_float *D, QDLDL_float *Dinv, QDLDL_int *Lnz, QDLDL_int *etree, QDLDL_bool *bwork, QDLDL_int *iwork, QDLDL_float *fwork){ 27 | 28 | 29 | 30 | 31 | 32 | QDLDL_int i; 33 | 34 | const QDLDL_int *Ap = h_col_ptr; 35 | const QDLDL_int *Ai = h_row_ind; 36 | 37 | //data for L and D factors 38 | QDLDL_int Ln = An; 39 | 40 | 41 | //Data for results of A\b 42 | QDLDL_float *x = h_lambda; 43 | 44 | QDLDL_factor(An,Ap,Ai,Ax,Lp,Li,Lx,D,Dinv,Lnz,etree,bwork,iwork,fwork); 45 | 46 | for(i=0;i < Ln; i++) x[i] = b[i]; 47 | 48 | QDLDL_solve(Ln,Lp,Li,Lx,Dinv,x); 49 | } 50 | 51 | 52 | template 53 | auto sqpSolveQdldl(uint32_t state_size, uint32_t control_size, uint32_t knot_points, float timestep, T *d_eePos_traj, T *d_lambda, T *d_xu, void *d_dynMem_const, T &rho, T rho_reset){ 54 | 55 | // data storage 56 | std::vector linsys_iter_vec; 57 | std::vector linsys_exit_vec; 58 | std::vector linsys_time_vec; 59 | bool sqp_time_exit = 1; // for data recording, not a flag 60 | 61 | 62 | 63 | // sqp timing 64 | struct timespec sqp_solve_start, sqp_solve_end; 65 | gpuErrchk(cudaDeviceSynchronize()); 66 | clock_gettime(CLOCK_MONOTONIC, &sqp_solve_start); 67 | 68 | 69 | const uint32_t states_sq = state_size*state_size; 70 | const uint32_t states_p_controls = state_size * control_size; 71 | const uint32_t controls_sq = control_size * control_size; 72 | const uint32_t states_s_controls = state_size + control_size; 73 | const uint32_t KKT_G_DENSE_SIZE_BYTES = static_cast(((states_sq+controls_sq)*knot_points-controls_sq)*sizeof(T)); 74 | const uint32_t KKT_C_DENSE_SIZE_BYTES = static_cast((states_sq+states_p_controls)*(knot_points-1)*sizeof(T)); 75 | const uint32_t KKT_g_SIZE_BYTES = 
static_cast(((state_size+control_size)*knot_points-control_size)*sizeof(T)); 76 | const uint32_t KKT_c_SIZE_BYTES = static_cast((state_size*knot_points)*sizeof(T)); 77 | const uint32_t DZ_SIZE_BYTES = static_cast((states_s_controls*knot_points-control_size)*sizeof(T)); 78 | 79 | 80 | // line search things 81 | const float mu = 10.0f; 82 | const uint32_t num_alphas = 8; 83 | T h_merit_news[num_alphas]; 84 | void *ls_merit_kernel = (void *) ls_gato_compute_merit; 85 | const size_t merit_smem_size = get_merit_smem_size(state_size, control_size); 86 | T h_merit_initial, min_merit; 87 | T alphafinal; 88 | T delta_merit_iter = 0; 89 | T delta_merit_total = 0; 90 | uint32_t line_search_step = 0; 91 | 92 | 93 | // streams n cublas init 94 | cudaStream_t streams[num_alphas]; 95 | for(uint32_t str = 0; str < num_alphas; str++){ 96 | cudaStreamCreate(&streams[str]); 97 | } 98 | gpuErrchk(cudaPeekAtLastError()); 99 | 100 | cublasHandle_t handle; 101 | if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed\n"); exit(13); } 102 | gpuErrchk(cudaPeekAtLastError()); 103 | 104 | 105 | uint32_t sqp_iter = 0; 106 | 107 | 108 | 109 | T *d_merit_initial, *d_merit_news, *d_merit_temp, 110 | *d_G_dense, *d_C_dense, *d_g, *d_c, *d_Ginv_dense, 111 | *d_S, *d_gamma, 112 | *d_dz, 113 | *d_xs; 114 | 115 | 116 | T drho = 1.0; 117 | T rho_factor = RHO_FACTOR; 118 | T rho_max = RHO_MAX; 119 | T rho_min = RHO_MIN; 120 | 121 | 122 | 123 | 124 | gpuErrchk(cudaMalloc(&d_G_dense, KKT_G_DENSE_SIZE_BYTES)); 125 | gpuErrchk(cudaMalloc(&d_C_dense, KKT_C_DENSE_SIZE_BYTES)); 126 | gpuErrchk(cudaMalloc(&d_g, KKT_g_SIZE_BYTES)); 127 | gpuErrchk(cudaMalloc(&d_c, KKT_c_SIZE_BYTES)); 128 | d_Ginv_dense = d_G_dense; 129 | 130 | gpuErrchk(cudaMalloc(&d_S, 3*states_sq*knot_points*sizeof(T))); 131 | gpuErrchk(cudaMalloc(&d_gamma, state_size*knot_points*sizeof(T))); 132 | gpuErrchk(cudaPeekAtLastError()); 133 | 134 | 135 | gpuErrchk(cudaMalloc(&d_dz, DZ_SIZE_BYTES)); 136 | 
gpuErrchk(cudaMalloc(&d_xs, state_size*sizeof(T))); 137 | gpuErrchk(cudaMemcpy(d_xs, d_xu, state_size*sizeof(T), cudaMemcpyDeviceToDevice)); 138 | gpuErrchk(cudaMalloc(&d_merit_news, 8*sizeof(T))); 139 | gpuErrchk(cudaMalloc(&d_merit_temp, 8*knot_points*sizeof(T))); 140 | // linsys iterates 141 | 142 | gpuErrchk(cudaMalloc(&d_merit_initial, sizeof(T))); 143 | gpuErrchk(cudaMemset(d_merit_initial, 0, sizeof(T))); 144 | 145 | 146 | 147 | 148 | const int nnz = (knot_points-1)*states_sq + knot_points*(((state_size+1)*state_size)/2); 149 | 150 | QDLDL_float h_lambda[state_size*knot_points]; 151 | QDLDL_float h_gamma[state_size*knot_points]; 152 | QDLDL_int h_col_ptr[state_size*knot_points+1]; 153 | QDLDL_int h_row_ind[nnz]; 154 | QDLDL_float h_val[nnz]; 155 | 156 | QDLDL_int *d_row_ind, *d_col_ptr; 157 | QDLDL_float *d_val, *d_lambda_double; 158 | gpuErrchk(cudaMalloc(&d_col_ptr, (state_size*knot_points+1)*sizeof(QDLDL_int))); 159 | gpuErrchk(cudaMalloc(&d_row_ind, nnz*sizeof(QDLDL_int))); 160 | gpuErrchk(cudaMalloc(&d_val, nnz*sizeof(QDLDL_float))); 161 | gpuErrchk(cudaMalloc(&d_lambda_double, (state_size*knot_points)*sizeof(QDLDL_float))); 162 | 163 | // fill col ptr and row ind, these won't change 164 | prep_csr<<>>(state_size, knot_points, d_col_ptr, d_row_ind); 165 | gpuErrchk(cudaMemcpy(h_col_ptr, d_col_ptr, (state_size*knot_points+1)*sizeof(QDLDL_int), cudaMemcpyDeviceToHost)); 166 | gpuErrchk(cudaMemcpy(h_row_ind, d_row_ind, (nnz)*sizeof(QDLDL_int), cudaMemcpyDeviceToHost)); 167 | 168 | 169 | const QDLDL_int An = state_size*knot_points; 170 | 171 | // Q things 172 | QDLDL_int sumLnz; 173 | QDLDL_int *etree; 174 | QDLDL_int *Lnz; 175 | etree = (QDLDL_int*)malloc(sizeof(QDLDL_int)*An); 176 | Lnz = (QDLDL_int*)malloc(sizeof(QDLDL_int)*An); 177 | 178 | QDLDL_int *Lp; 179 | QDLDL_float *D; 180 | QDLDL_float *Dinv; 181 | Lp = (QDLDL_int*)malloc(sizeof(QDLDL_int)*(An+1)); 182 | D = (QDLDL_float*)malloc(sizeof(QDLDL_float)*An); 183 | Dinv = 
(QDLDL_float*)malloc(sizeof(QDLDL_float)*An); 184 | 185 | //working data for factorisation 186 | QDLDL_int *iwork; 187 | QDLDL_bool *bwork; 188 | QDLDL_float *fwork; 189 | iwork = (QDLDL_int*)malloc(sizeof(QDLDL_int)*(3*An)); 190 | bwork = (QDLDL_bool*)malloc(sizeof(QDLDL_bool)*An); 191 | fwork = (QDLDL_float*)malloc(sizeof(QDLDL_float)*An); 192 | 193 | sumLnz = QDLDL_etree(An,h_col_ptr,h_row_ind,iwork,Lnz,etree); 194 | 195 | QDLDL_int *Li; 196 | QDLDL_float *Lx; 197 | Li = (QDLDL_int*)malloc(sizeof(QDLDL_int)*sumLnz); 198 | Lx = (QDLDL_float*)malloc(sizeof(QDLDL_float)*sumLnz); 199 | 200 | gpuErrchk(cudaPeekAtLastError()); 201 | gpuErrchk(cudaDeviceSynchronize()); 202 | #if TIME_LINSYS == 1 203 | struct timespec linsys_start, linsys_end; 204 | double linsys_time; 205 | #endif 206 | #if CONST_UPDATE_FREQ 207 | struct timespec sqp_cur; 208 | auto sqpTimecheck = [&]() { 209 | clock_gettime(CLOCK_MONOTONIC, &sqp_cur); 210 | return time_delta_us_timespec(sqp_solve_start,sqp_cur) > SQP_MAX_TIME_US; 211 | }; 212 | #else 213 | auto sqpTimecheck = [&]() { return false; }; 214 | #endif 215 | 216 | 217 | ///TODO: atomic race conditions here aren't fixed but don't seem to be problematic 218 | compute_merit<<>>( 219 | state_size, control_size, knot_points, 220 | d_xu, 221 | d_eePos_traj, 222 | static_cast(10), 223 | timestep, 224 | d_dynMem_const, 225 | d_merit_initial 226 | ); 227 | gpuErrchk(cudaMemcpyAsync(&h_merit_initial, d_merit_initial, sizeof(T), cudaMemcpyDeviceToHost)); 228 | gpuErrchk(cudaPeekAtLastError()); 229 | 230 | // gpuErrchk(cudaDeviceSynchronize()); 231 | // std::cout << "initial merit " << h_merit_initial << std::endl; 232 | // exit(0); 233 | 234 | // 235 | // SQP LOOP 236 | // 237 | for(uint32_t sqpiter = 0; sqpiter < SQP_MAX_ITER; sqpiter++){ 238 | 239 | generate_kkt_submatrices<<(state_size, control_size)>>>( 240 | state_size, 241 | control_size, 242 | knot_points, 243 | d_G_dense, 244 | d_C_dense, 245 | d_g, 246 | d_c, 247 | d_dynMem_const, 248 | 
timestep, 249 | d_eePos_traj, 250 | d_xs, 251 | d_xu 252 | ); 253 | gpuErrchk(cudaPeekAtLastError()); 254 | if (sqpTimecheck()){ break; } 255 | 256 | 257 | form_schur_system_qdldl(state_size, control_size, knot_points, d_G_dense, d_C_dense, d_g, d_c, d_val, d_gamma, rho); 258 | gpuErrchk(cudaPeekAtLastError()); 259 | if (sqpTimecheck()){ break; } 260 | 261 | #if TIME_LINSYS == 1 262 | gpuErrchk(cudaDeviceSynchronize()); 263 | if (sqpTimecheck()){ break; } 264 | clock_gettime(CLOCK_MONOTONIC, &linsys_start); 265 | #endif // #if TIME_LINSYS 266 | 267 | 268 | gpuErrchk(cudaMemcpy(h_val, d_val, (nnz)*sizeof(T), cudaMemcpyDeviceToHost)); 269 | gpuErrchk(cudaMemcpy(h_gamma, d_gamma, (state_size*knot_points)*sizeof(T), cudaMemcpyDeviceToHost)) 270 | 271 | qdldl_solve_schur(An, h_col_ptr, h_row_ind, h_val, h_gamma, h_lambda, Lp, Li, Lx, D, Dinv, Lnz, etree, bwork, iwork, fwork); 272 | 273 | gpuErrchk(cudaMemcpy(d_lambda, h_lambda, (state_size*knot_points)*sizeof(T), cudaMemcpyHostToDevice)); 274 | 275 | 276 | #if TIME_LINSYS == 1 277 | gpuErrchk(cudaDeviceSynchronize()); 278 | clock_gettime(CLOCK_MONOTONIC, &linsys_end); 279 | 280 | linsys_time = time_delta_us_timespec(linsys_start, linsys_end); 281 | linsys_time_vec.push_back(linsys_time); 282 | #endif // #if TIME_LINSYS 283 | 284 | if (sqpTimecheck()){ break; } 285 | 286 | // recover dz 287 | compute_dz( 288 | state_size, 289 | control_size, 290 | knot_points, 291 | d_Ginv_dense, 292 | d_C_dense, 293 | d_g, 294 | d_lambda, 295 | d_dz 296 | ); 297 | gpuErrchk(cudaPeekAtLastError()); 298 | if (sqpTimecheck()){ break; } 299 | 300 | 301 | // line search 302 | for(uint32_t p = 0; p < num_alphas; p++){ 303 | void *kernelArgs[] = { 304 | (void *)&state_size, 305 | (void *)&control_size, 306 | (void *)&knot_points, 307 | (void *)&d_xs, 308 | (void *)&d_xu, 309 | (void *)&d_eePos_traj, 310 | (void *)&mu, 311 | (void *)×tep, 312 | (void *)&d_dynMem_const, 313 | (void *)&d_dz, 314 | (void *)&p, 315 | (void *)&d_merit_news, 316 | 
(void *)&d_merit_temp 317 | }; 318 | gpuErrchk(cudaLaunchCooperativeKernel(ls_merit_kernel, knot_points, MERIT_THREADS, kernelArgs, get_merit_smem_size(state_size, knot_points), streams[p])); 319 | } 320 | if (sqpTimecheck()){ break; } 321 | gpuErrchk(cudaPeekAtLastError()); 322 | gpuErrchk(cudaDeviceSynchronize()); 323 | 324 | 325 | cudaMemcpy(h_merit_news, d_merit_news, 8*sizeof(T), cudaMemcpyDeviceToHost); 326 | if (sqpTimecheck()){ break; } 327 | 328 | 329 | line_search_step = 0; 330 | min_merit = h_merit_initial; 331 | for(int i = 0; i < 8; i++){ 332 | // std::cout << h_merit_news[i] << (i == 7 ? "\n" : " "); 333 | ///TODO: reduction ratio 334 | if(h_merit_news[i] < min_merit){ 335 | min_merit = h_merit_news[i]; 336 | line_search_step = i; 337 | } 338 | } 339 | 340 | 341 | if(min_merit == h_merit_initial){ 342 | // line search failure 343 | drho = max(drho*rho_factor, rho_factor); 344 | rho = max(rho*drho, rho_min); 345 | sqp_iter++; 346 | if(rho > rho_max){ 347 | sqp_time_exit = 0; 348 | rho = rho_reset; 349 | break; 350 | } 351 | continue; 352 | } 353 | // std::cout << "line search accepted\n"; 354 | alphafinal = -1.0 / (1 << line_search_step); // alpha sign 355 | 356 | drho = min(drho/rho_factor, 1/rho_factor); 357 | rho = max(rho*drho, rho_min); 358 | 359 | 360 | #if USE_DOUBLES 361 | cublasDaxpy( 362 | handle, 363 | DZ_SIZE_BYTES / sizeof(T), 364 | &alphafinal, 365 | d_dz, 1, 366 | d_xu, 1 367 | ); 368 | #else 369 | cublasSaxpy( 370 | handle, 371 | DZ_SIZE_BYTES / sizeof(T), 372 | &alphafinal, 373 | d_dz, 1, 374 | d_xu, 1 375 | ); 376 | #endif 377 | 378 | gpuErrchk(cudaPeekAtLastError()); 379 | // if success increment after update 380 | sqp_iter++; 381 | 382 | if (sqpTimecheck()){ break; } 383 | 384 | 385 | delta_merit_iter = h_merit_initial - min_merit; 386 | delta_merit_total += delta_merit_iter; 387 | 388 | 389 | h_merit_initial = min_merit; 390 | 391 | } 392 | 393 | gpuErrchk(cudaPeekAtLastError()); 394 | gpuErrchk(cudaDeviceSynchronize()); 395 | 
clock_gettime(CLOCK_MONOTONIC, &sqp_solve_end); 396 | 397 | cublasDestroy(handle); 398 | 399 | for(uint32_t st=0; st < num_alphas; st++){ 400 | gpuErrchk(cudaStreamDestroy(streams[st])); 401 | } 402 | 403 | 404 | 405 | 406 | gpuErrchk(cudaFree(d_merit_initial)); 407 | gpuErrchk(cudaFree(d_merit_news)); 408 | gpuErrchk(cudaFree(d_merit_temp)); 409 | gpuErrchk(cudaFree(d_G_dense)); 410 | gpuErrchk(cudaFree(d_C_dense)); 411 | gpuErrchk(cudaFree(d_g)); 412 | gpuErrchk(cudaFree(d_c)); 413 | gpuErrchk(cudaFree(d_S)); 414 | gpuErrchk(cudaFree(d_gamma)); 415 | gpuErrchk(cudaFree(d_dz)); 416 | gpuErrchk(cudaFree(d_xs)); 417 | gpuErrchk(cudaFree(d_col_ptr)); 418 | gpuErrchk(cudaFree(d_row_ind)); 419 | gpuErrchk(cudaFree(d_val)); 420 | gpuErrchk(cudaFree(d_lambda_double)); 421 | free(etree); 422 | free(Lnz); 423 | free(Lp); 424 | free(D); 425 | free(Dinv); 426 | free(iwork); 427 | free(bwork); 428 | free(fwork); 429 | free(Li); 430 | free(Lx); 431 | 432 | double sqp_solve_time = time_delta_us_timespec(sqp_solve_start, sqp_solve_end); 433 | 434 | return std::make_tuple(linsys_iter_vec, linsys_time_vec, sqp_solve_time, sqp_iter, sqp_time_exit, linsys_exit_vec); 435 | } 436 | -------------------------------------------------------------------------------- /include/utils/csr.cuh: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "glass.cuh" 5 | #include "qdldl.h" 6 | #include 7 | 8 | 9 | // fills in the values of the lower triangle of a symmetric block tridiagonal matrix 10 | template 11 | __device__ 12 | void store_block_csr_lowertri(uint32_t bdim, uint32_t mdim, T *d_src, QDLDL_float *d_val, bool col1, uint32_t bd_block_row, int32_t multiplier=1){ 13 | 14 | const int brow_val_ct = bdim*bdim + ((bdim+1)*bdim)/2; 15 | int row, col, csr_row_offset, full_csr_offset; 16 | int write_len; 17 | int cur_triangle_offset; 18 | 19 | for(row = threadIdx.x; row < bdim; row += blockDim.x){ 20 | 21 | 22 | 
cur_triangle_offset = ((row+1)*row)/2; 23 | csr_row_offset = (bd_block_row>0)*((bdim+1)*bdim)/2 + // add triangle if not first block row 24 | (bd_block_row>0) * (bd_block_row-1)*brow_val_ct + // add previous full block rows if not first block row 25 | (bd_block_row>0)*row*bdim + // 26 | cur_triangle_offset; // triangle offset 27 | 28 | 29 | write_len = (bd_block_row>0)*((!col1)*(bdim)+(col1)*(row+1)) + (col1)*(bd_block_row==0)*(row+1); 30 | 31 | for(col = 0; col0)*(col1)*bdim + col; 33 | d_val[full_csr_offset] = static_cast(d_src[row + col*bdim]) * multiplier; 34 | } 35 | } 36 | } 37 | 38 | 39 | // fills in the column pointers and row indices for the CSR representation of the lower triangle of a symmetric block tridiagonal matrix 40 | __global__ 41 | void prep_csr(uint32_t state_size, uint32_t knot_points, QDLDL_int *d_col_ptr, QDLDL_int *d_row_ind){ 42 | 43 | for (uint32_t blockrow = blockIdx.x; blockrow < knot_points; blockrow+=gridDim.x) 44 | { 45 | const int brow_val_ct = state_size*state_size + ((state_size+1)*state_size)/2; 46 | int row, col, csr_row_offset, full_csr_offset, bd_row_len; 47 | int cur_triangle_offset; 48 | 49 | for(row = threadIdx.x; row < state_size; row += blockDim.x){ 50 | 51 | 52 | if(blockrow==0 && row==0){ 53 | d_col_ptr[0] = 0; 54 | } 55 | 56 | cur_triangle_offset = ((row+1)*row)/2; 57 | csr_row_offset = (blockrow>0)*((state_size+1)*state_size)/2 + // add triangle if not first block row 58 | (blockrow>0) * (blockrow-1)*brow_val_ct + // add previous full block rows if not first block row 59 | (blockrow>0)*row*state_size + // 60 | cur_triangle_offset; // triangle offset 61 | 62 | 63 | bd_row_len = (blockrow>0)*state_size + row+1; 64 | d_col_ptr[blockrow*state_size + row+1] = csr_row_offset+bd_row_len; 65 | 66 | for(col = 0; col < bd_row_len; col++){ 67 | full_csr_offset = csr_row_offset + col; 68 | d_row_ind[full_csr_offset] = (blockrow>0)*(blockrow-1)*state_size + col; 69 | } 70 | 71 | } 72 | } 73 | 74 | } 
--------------------------------------------------------------------------------
/include/utils/experiment.cuh:
--------------------------------------------------------------------------------
#pragma once

// NOTE(review): the original include targets were lost when this listing was
// extracted. The headers below cover every standard-library name used in this
// file; confirm against the upstream header.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <ctime>
#include <fstream>
#include <iostream>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

// Elapsed time between two `struct timespec` values, in microseconds (double).
// Arguments are parenthesized so expressions can be passed safely.
#define time_delta_us_timespec(start,end) (1e6*static_cast<double>((end).tv_sec - (start).tv_sec)+1e-3*static_cast<double>((end).tv_nsec - (start).tv_nsec))

/**
 * Prints summary statistics for a list of timings to stdout.
 *
 * Always prints mean, population standard deviation, minimum and maximum.
 * When PRINT_DISTRIBUTION is true it additionally prints a 7-bucket histogram
 * (in units of standard deviation), selected percentiles, and the percentile at
 * which samples first exceed the mean by 1/2/5/10 percent.
 *
 * Side effect: when PRINT_DISTRIBUTION is true, `*times` is sorted in place.
 *
 * @param times  samples to summarize; a null or empty vector prints a notice
 *               and returns (previously this divided by zero and threw from at()).
 *
 * NOTE(review): the template parameter list was stripped in extraction;
 * `bool PRINT_DISTRIBUTION = true` is reconstructed from its use below.
 * Column spacing inside the table format strings is reproduced as found.
 */
template <bool PRINT_DISTRIBUTION = true>
void printStats(std::vector<double> *times){
    if (times == nullptr || times->empty()){
        printf("No samples to summarize\n");
        return;
    }

    const size_t n = times->size();
    const double count = static_cast<double>(n);

    const double sum = std::accumulate(times->begin(), times->end(), 0.0);
    const double mean = sum/count;

    // population variance: mean of squared deviations
    double sq_sum = 0.0;
    for (double value : *times){
        const double dev = value - mean;
        sq_sum += dev*dev;
    }
    const double stdev = std::sqrt(sq_sum / count);

    const double min = *std::min_element(times->begin(), times->end());
    const double max = *std::max_element(times->begin(), times->end());
    printf("Average[%fus] Std Dev [%fus] Min [%fus] Max [%fus] \n",mean,stdev,min,max);

    if (!PRINT_DISTRIBUTION){ return; }

    // histogram buckets: (<-3s) [-3s,-2s) [-2s,-1s) [-1s,+1s] (+1s,+2s] (+2s,+3s] (>+3s)
    double hist[] = {0,0,0,0,0,0,0};
    for (double value : *times){
        int bucket = 3;
        if (value < mean - stdev){
            bucket = (value < mean - 3*stdev) ? 0 : (value < mean - 2*stdev) ? 1 : 2;
        }
        else if (value > mean + stdev){
            bucket = (value > mean + 3*stdev) ? 6 : (value > mean + 2*stdev) ? 5 : 4;
        }
        hist[bucket] += 1.0;
    }
    for (int b = 0; b < 7; b++){ hist[b] = (hist[b]/count)*100; }
    printf(" Distribution | -3 | -2 | -1 | 0 | 1 | 2 | 3 |\n");
    printf(" (X std dev) | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f | %2.2f |\n",
           hist[0],hist[1],hist[2],hist[3],hist[4],hist[5],hist[6]);

    // percentiles are read from the sorted samples (this reorders the caller's vector)
    std::sort(times->begin(), times->end());
    const std::vector<double> &sorted = *times;
    printf(" Percentiles | 50 | 60 | 70 | 75 | 80 | 85 | 90 | 95 | 99 |\n");
    printf(" | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f | %.2f |\n",
           sorted[n/2],sorted[n/5*3],sorted[n/10*7],
           sorted[n/4*3],sorted[n/5*4],sorted[n/20*17],
           sorted[n/10*9],sorted[n/20*19],sorted[n/100*99]);

    // report the first percentile at which a sample exceeds the mean by each margin
    const double factors[] = {1.01, 1.02, 1.05, 1.10};
    const int percents[] = {1, 2, 5, 10};
    bool reported[] = {false, false, false, false};
    for (size_t i = 0; i < n; i++){
        for (int k = 0; k < 4; k++){
            if (!reported[k] && sorted[i] >= mean * factors[k]){
                reported[k] = true;
                printf(" More than %d Percent above mean at [%2.2f] Percentile\n",
                       percents[k], static_cast<double>(i)/count*100.0);
            }
        }
    }
}

/**
 * Returns the current local time formatted as "YYYYMMDD_HHMMSS".
 * Returns an empty string if the local time cannot be obtained or formatted.
 * Declared inline because it is defined in a header.
 */
inline std::string getCurrentTimestamp() {
    time_t rawtime;
    time(&rawtime);

    const struct tm *timeinfo = localtime(&rawtime);
    if (timeinfo == nullptr){
        return std::string();
    }

    char buffer[80];
    const size_t written = strftime(buffer, sizeof(buffer), "%Y%m%d_%H%M%S", timeinfo);
    return std::string(buffer, written);
}

// Function to format stats string values into CSV format
/**
 * Extracts every "[...]" field from `statsString` and joins the field contents
 * with commas, e.g. "Average[1.5] Min [1]" -> "1.5,1".
 *
 * An empty field ("[]") is kept as an empty CSV column, including when it is
 * the first field (previously a leading empty field was silently dropped).
 * Declared inline because it is defined in a header.
 */
inline std::string getStatsString(const std::string& statsString) {
    std::stringstream ss(statsString);
    std::string token;
    std::string csvFormattedString;
    bool firstField = true;

    // skip text up to the next '[', then read the field up to the matching ']'
    while (getline(ss, token, '[')) {
        if (getline(ss, token, ']')) {
            if (!firstField) {
                csvFormattedString += ",";
            }
            csvFormattedString += token;
            firstField = false;
        }
    }

    return csvFormattedString;
}

/**
 * Prints and returns summary statistics (mean, population std dev, min, max,
 * median, first and third quartile) of `data` in the bracketed format that
 * getStatsString() parses.
 *
 * @param data    samples; not modified (quartiles are taken from a sorted copy).
 *                A null or empty vector prints a notice and returns "".
 * @param prefix  unused by this function; kept for interface compatibility.
 * @return        the same text that is written to std::cout.
 *
 * NOTE(review): template arguments were stripped in extraction; `typename T`,
 * `std::vector<T>` and `static_cast<float>` are reconstructed from usage.
 */
template <typename T>
std::string printStats(std::vector<T> *data, std::string prefix = "data"){
    (void)prefix;
    if (data == nullptr || data->empty()){
        std::cout << std::endl << "No samples to summarize" << std::endl;
        return std::string();
    }

    const size_t n = data->size();

    T sum = std::accumulate(data->begin(), data->end(), static_cast<T>(0));
    float mean = sum/static_cast<float>(n);
    std::vector<T> diff(n);
    std::transform(data->begin(), data->end(), diff.begin(), [mean](T x) {return x - mean;});
    T sq_sum = std::inner_product(diff.begin(), diff.end(), diff.begin(), 0.0);
    T stdev = std::sqrt(sq_sum / n);
    T min = *std::min_element(data->begin(), data->end());
    T max = *std::max_element(data->begin(), data->end());

    // Now also want to sort and get median, first and third quartile for variance plot
    std::vector<T> sortedData(*data);
    std::sort(sortedData.begin(), sortedData.end());

    std::cout << std::endl;
    T median, Q1, Q3;
    if (n % 2 == 0) {
        // n/4 is 0 when n == 2; clamp so the lower neighbour never underflows
        const size_t q1Hi = n/4;
        const size_t q1Lo = (q1Hi > 0) ? q1Hi - 1 : 0;
        median = (sortedData[n/2 - 1] + sortedData[n/2]) / 2.0;
        Q1 = (sortedData[q1Lo] + sortedData[q1Hi]) / 2.0;
        Q3 = (sortedData[3*n/4 - 1] + sortedData[3*n/4]) / 2.0;
    } else {
        median = sortedData[n/2];
        Q1 = sortedData[n/4];
        Q3 = sortedData[3*n/4];
    }

    // One writer for both sinks; each stream keeps its own formatting flags.
    auto writeStats = [&](std::ostream &os) {
        os << "Average[" << mean << "] Std Dev [" << stdev << "] Min [" << min << "] Max [" << max
           << "] Median [" << median << "] Q1 [" << Q1 << "] Q3 [" << Q3 << "]";
    };

    writeStats(std::cout);
    std::cout << std::endl;

    // Construct the formatted string
    std::stringstream ss;
    writeStats(ss);
    return ss.str();
}

/**
 * Reads a comma-separated file into a vector of rows.
 * Each line becomes one row; each comma-separated token is parsed with std::stof.
 * If the file cannot be opened an error is written to std::cerr and an empty
 * result is returned. std::stof throws on tokens that are not numbers.
 */
template <typename T>
std::vector<std::vector<T>> readCSVToVecVec(const std::string& filename) {
    std::vector<std::vector<T>> data;
    std::ifstream infile(filename);

    if (!infile.is_open()) {
        std::cerr << "File [ " << filename << " ] could not be opened!\n";
        return data;
    }

    std::string line;
    while (std::getline(infile, line)) {
        std::vector<T> row;
        std::stringstream ss(line);
        std::string val;

        while (std::getline(ss, val, ',')) {
            row.push_back(std::stof(val));
        }

        data.push_back(row);
    }

    return data;
}
--------------------------------------------------------------------------------
/include/utils/matrix.cuh:
--------------------------------------------------------------------------------
#pragma once
// NOTE(review): the include target on this file's original line 2 was lost in
// extraction; the standard headers below cover the names used in this file.
#include <cstdint>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <string>
#include <vector>
// TODO: GBD-PCG utils include fix
#include "utils.cuh"


// out[i] = sum_k mat[i*m + k] * vec[k] for i in [0, n); rows of the result are
// strided across the threads of the block.
// NOTE(review): the inner loop body was lost in extraction and is reconstructed
// here (A^T x for a column-major m x n matrix) -- confirm against upstream.
template <typename T>
__device__
void gato_ATx(T *out, T *mat, T *vec, int m, int n){

    T res;
    int ind, thing;

    for(ind=threadIdx.x; ind < n; ind +=blockDim.x){

        res = 0;
        for(thing=0; thing<m; thing++){
            res += mat[ind*m+thing] * vec[thing];
        }

        out[ind] = res;
    }
}

// out = vec1 - vec2, element-wise over `size` entries, strided across the block's threads
template <typename T>
__device__
void gato_vec_dif(T *out, T *vec1, T *vec2, int size){
    for(int i = threadIdx.x; i < size; i+= blockDim.x){
        out[i] = vec1[i] - vec2[i];
    }
}

// out = vec1 + vec2, element-wise over `size` entries, strided across the block's threads
template <typename T>
__device__
void gato_vec_sum(T *out, T *vec1, T *vec2, int size){
    for(int i = threadIdx.x; i < size; i+= blockDim.x){
        out[i] = vec1[i] + vec2[i];
    }
}


// out = mat * vec, where mat is MAT_ROWS x MAT_COLS indexed as mat[row + col*MAT_ROWS];
// one output row per thread iteration.
// NOTE(review): the loop header was partially lost in extraction and is reconstructed.
template <typename T>
__device__
void mat_vec_prod(unsigned MAT_ROWS, unsigned MAT_COLS, T *mat, T *vec, T *out){

    for(unsigned row=threadIdx.x; row<MAT_ROWS; row+=blockDim.x){
        T res = static_cast<T>(0);
        for (unsigned col = 0; col < MAT_COLS; col++){
            res += mat[row + col*MAT_ROWS] * vec[col];
        }
        out[row] = res;
    }
}

// A += factor * I for a dim x dim matrix; walks the diagonal directly instead of
// testing every one of the dim*dim entries.
template <typename T>
__device__
void add_identity(T *A, unsigned dim, T factor){
    for(unsigned d = threadIdx.x; d < dim; d+=blockDim.x){
        A[d*dim + d] += factor;
    }
}



// writes the DIM x DIM identity into A
template <typename T>
__device__ __forceinline__
void loadIdentity(uint32_t DIM, T *A){
    for (unsigned ind = threadIdx.x; ind < DIM*DIM; ind += blockDim.x){
        unsigned r, c;
        r = ind % DIM;
        c = ind / DIM;
        A[ind] = static_cast<T>(r == c);
    }
}

// writes identities into A (DIMA x DIMA) and B (DIMB x DIMB) in a single pass
template <typename T>
__device__ __forceinline__
void loadIdentity(uint32_t DIMA, uint32_t DIMB, T *A, T *B){
    for (unsigned ind = threadIdx.x; ind < DIMA*DIMA+DIMB*DIMB; ind += blockDim.x){
        unsigned r, c, indAdj; T *V;
        if (ind < DIMA*DIMA){
            indAdj = ind;
            r = indAdj % DIMA; c = indAdj/DIMA; V = A;
        }
        else {
            indAdj = ind - DIMA*DIMA;
            r = indAdj % DIMB; c = indAdj/DIMB; V = B;
        }
        V[indAdj] = static_cast<T>(r == c);
    }
}


// writes identities into A, B and C (DIMA, DIMB, DIMC square) in a single pass
template <typename T>
__device__ __forceinline__
void loadIdentity(unsigned DIMA, unsigned DIMB, unsigned DIMC, T *A, T *B, T *C){
    for (unsigned ind = threadIdx.x; ind < DIMA*DIMA+DIMB*DIMB+DIMC*DIMC; ind += blockDim.x){
        unsigned r, c, indAdj; T *V;
        if (ind < DIMA*DIMA){
            indAdj = ind;
            r = indAdj % DIMA; c = indAdj/DIMA; V = A;
        }
        else if (ind < DIMA*DIMA+DIMB*DIMB){
            indAdj = ind - DIMA*DIMA;
            r = indAdj % DIMB; c = indAdj/DIMB; V = B;
        }
        else{
            indAdj = ind - DIMA*DIMA - DIMB*DIMB;
            r = indAdj % DIMC; c = indAdj/DIMC; V = C;
        }
        V[indAdj] = static_cast<T>(r == c);
    }
}

// In-place Gauss-Jordan elimination without pivoting (assumes no zero pivots).
// A is read and written as DIM rows by 2*DIM columns (column-major, [V | I]);
// s_temp needs 2*DIM+1 entries. Must be called by every thread of the block
// because it contains __syncthreads().
template <typename T>
__device__
void invertMatrix(uint32_t DIM, T *A, T *s_temp){
    // we are going to guassian elimination walking down the matrix (assuming no leading 0s)
    // we therefore use the columns in order as the pivot column for each pivot we need to rescale
    // that row so that the pivot value (pv) is 1 THEN for all other row values (orv) we need to add a multiple
    // of the NEW pivot row value (prv) such that we transorm the other row pivot column value (orpcv) to 0
    // pr *= 1/pv   orv -= orpcv*prv == orv -= orpcv*1/pv*prvOld
    for (unsigned pivRC = 0; pivRC < DIM; pivRC++){
        unsigned pivColOffset = pivRC*DIM;
        // save the pivot and pivot column and row
        T pvInv = static_cast<T>(1)/A[pivRC + pivColOffset];
        // stride by blockDim.x so each entry is copied by exactly one thread
        for (unsigned ind = threadIdx.x; ind < 2*DIM+1; ind += blockDim.x){
            unsigned AInd;
            if (ind < DIM){AInd = ind + pivColOffset;}
            else{AInd = pivRC + pivColOffset + (ind-DIM)*DIM;}
            s_temp[ind] = A[AInd];
        }
        __syncthreads(); //----------------------
        // make the pivot update
        for (unsigned ind = threadIdx.x; ind < DIM*(DIM+1); ind += blockDim.x){
            unsigned row = ind % DIM; unsigned col = ind / DIM; unsigned colOffset = ind - row;
            // s_temp = orpcvs|prvOld
            if (row == pivRC){A[row + pivColOffset + colOffset] *= pvInv;}
            else{A[row + pivColOffset + colOffset] -= s_temp[row]*pvInv*s_temp[DIM+col];}
        }
        __syncthreads(); //----------------------
    }
}


// Inverts A and B together (same scheme as the single-matrix version).
// s_temp is split as [colA,rowA | colB,rowB] and needs 2*DIMA + 2*DIMB + 2 entries.
// MAX_DIM bounds the pivot loop; callers in this project pass max(DIMA, DIMB).
// Must be called by every thread of the block because it contains __syncthreads().
template <typename T>
__device__
void invertMatrix(unsigned DIMA, unsigned DIMB, unsigned MAX_DIM, T *A, T *B, T *s_temp){

    // now we are going to guassian elimination walking down the matrix (assuming no leading 0s)
    // we therefore use the columns in order as the pivot column for each pivot we need to rescale
    // that row so that the pivot value (pv) is 1 THEN for all other row values (orv) we need to add a multiple
    // of the NEW pivot row value (prv) such that we transorm the other row pivot column value (orpcv) to 0
    // pr *= 1/pv   orv -= orpcv*prv == orv -= orpcv*1/pv*prvOld
    T *s_memA = s_temp; T *s_memB = &s_memA[2*DIMA+1];
    for (unsigned pivRC = 0; pivRC < MAX_DIM; pivRC++){
        bool AActive = pivRC < DIMA; bool BActive = pivRC < DIMB;
        unsigned pivColOffsetA = pivRC*DIMA; unsigned pivColOffsetB = pivRC*DIMB;
        // save the pivot column and row (strided: one writer per entry)
        for (unsigned ind = threadIdx.x; ind < MAX_DIM; ind += blockDim.x){
            if (AActive && ind < DIMA){s_memA[ind] = A[ind + pivColOffsetA];}
            if (BActive && ind < DIMB){s_memB[ind] = B[ind + pivColOffsetB];}
        }
        for (unsigned ind = threadIdx.x; ind < MAX_DIM+1; ind += blockDim.x){
            if (AActive && ind < DIMA+1){s_memA[ind + DIMA] = A[ind*DIMA + pivRC + pivColOffsetA];}
            if (BActive && ind < DIMB+1){s_memB[ind + DIMB] = B[ind*DIMB + pivRC + pivColOffsetB];}
        }
        __syncthreads(); //----------------------
        // make the pivot update with s_mem = [colA,rowA,colB,rowB]
        for (unsigned ind = threadIdx.x; ind < MAX_DIM*(MAX_DIM+1); ind += blockDim.x){
            if (AActive && ind < DIMA*(DIMA+1)){
                unsigned row = ind % DIMA; unsigned col = ind / DIMA;
                if (row == pivRC){A[pivColOffsetA + ind] /= s_memA[pivRC];}
                else{A[pivColOffsetA + ind] -= s_memA[row]/s_memA[pivRC]*s_memA[DIMA+col];}
            }
            if (BActive && ind < DIMB*(DIMB+1)){
                unsigned row = ind % DIMB; unsigned col = ind / DIMB;
                if (row == pivRC){B[pivColOffsetB + ind] /= s_memB[pivRC];}
                else{B[pivColOffsetB + ind] -= s_memB[row]/s_memB[pivRC]*s_memB[DIMB+col];}
            }
        }
        __syncthreads(); //----------------------
    }
}

// invert A,B,C assume memory for all is [V | VInv] where both are DIMxDIM and continguous
// relies on s_temp of size [2*DIMA + 2*DIMB + 2*DIMC + 3]
// Must be called by every thread of the block because it contains __syncthreads().
template <typename T>
__device__
void invertMatrix(unsigned DIMA, unsigned DIMB, unsigned DIMC, unsigned MAX_DIM, T *A, T *B, T *C, T *s_temp){

    // now we are going to guassian elimination walking down the matrix (assuming no leading 0s)
    // we therefore use the columns in order as the pivot column for each pivot we need to rescale
    // that row so that the pivot value (pv) is 1 THEN for all other row values (orv) we need to add a multiple
    // of the NEW pivot row value (prv) such that we transorm the other row pivot column value (orpcv) to 0
    // pr *= 1/pv   orv -= orpcv*prv == orv -= orpcv*1/pv*prvOld
    T *s_memA = s_temp; T *s_memB = &s_memA[2*DIMA+1]; T *s_memC = &s_memB[2*DIMB+1];
    for (unsigned pivRC = 0; pivRC < MAX_DIM; pivRC++){
        bool AActive = pivRC < DIMA; bool BActive = pivRC < DIMB; bool CActive = pivRC < DIMC;
        unsigned pivColOffsetA = pivRC*DIMA; unsigned pivColOffsetB = pivRC*DIMB; unsigned pivColOffsetC = pivRC*DIMC;
        // save the pivot column and row (strided: one writer per entry)
        for (unsigned ind = threadIdx.x; ind < MAX_DIM; ind += blockDim.x){
            if (AActive && ind < DIMA){s_memA[ind] = A[ind + pivColOffsetA];}
            if (BActive && ind < DIMB){s_memB[ind] = B[ind + pivColOffsetB];}
            if (CActive && ind < DIMC){s_memC[ind] = C[ind + pivColOffsetC];}
        }
        for (unsigned ind = threadIdx.x; ind < MAX_DIM+1; ind += blockDim.x){
            if (AActive && ind < DIMA+1){s_memA[ind + DIMA] = A[ind*DIMA + pivRC + pivColOffsetA];}
            if (BActive && ind < DIMB+1){s_memB[ind + DIMB] = B[ind*DIMB + pivRC + pivColOffsetB];}
            if (CActive && ind < DIMC+1){s_memC[ind + DIMC] = C[ind*DIMC + pivRC + pivColOffsetC];}
        }
        __syncthreads(); //----------------------
        // make the pivot update with s_mem = [colA,rowA,colB,rowB,colC,rowC]
        for (unsigned ind = threadIdx.x; ind < MAX_DIM*(MAX_DIM+1); ind += blockDim.x){
            if (AActive && ind < DIMA*(DIMA+1)){
                unsigned row = ind % DIMA; unsigned col = ind / DIMA;
                if (row == pivRC){A[pivColOffsetA + ind] /= s_memA[pivRC];}
                else{A[pivColOffsetA + ind] -= s_memA[row]/s_memA[pivRC]*s_memA[DIMA+col];}
            }
            if (BActive && ind < DIMB*(DIMB+1)){
                unsigned row = ind % DIMB; unsigned col = ind / DIMB;
                if (row == pivRC){B[pivColOffsetB + ind] /= s_memB[pivRC];}
                else{B[pivColOffsetB + ind] -= s_memB[row]/s_memB[pivRC]*s_memB[DIMB+col];}
            }
            if (CActive && ind < DIMC*(DIMC+1)){
                unsigned row = ind % DIMC; unsigned col = ind / DIMC;
                if (row == pivRC){C[pivColOffsetC + ind] /= s_memC[pivRC];}
                else{C[pivColOffsetC + ind] -= s_memC[row]/s_memC[pivRC]*s_memC[DIMC+col];}
            }
        }
        __syncthreads(); //----------------------
    }
}


/**
 * Copies a rows x cols float matrix from device memory and writes it as
 * tab-separated text to "<filename><filesuffix>.txt", one matrix row per line.
 * The device buffer is read as column-major (element (r,c) at index c*rows + r).
 *
 * Errors (failed device copy, file cannot be opened) are reported on std::cerr;
 * nothing is written if the device copy fails. Declared inline because it is
 * defined in a header.
 */
inline void write_device_matrix_to_file(float* d_matrix, int rows, int cols, const char* filename, int filesuffix = 0) {

    const std::string fname = std::string(filename ? filename : "") + std::to_string(filesuffix) + ".txt";

    // Host staging buffer for the matrix (empty when either dimension is non-positive)
    const size_t count = (rows > 0 && cols > 0) ? static_cast<size_t>(rows) * static_cast<size_t>(cols) : 0;
    std::vector<float> h_matrix(count);

    // Copy the data from the device to the host memory; the buffer is contiguous,
    // so a plain copy of rows*cols floats is sufficient
    if (count > 0) {
        const cudaError_t status = cudaMemcpy(h_matrix.data(), d_matrix, count * sizeof(float), cudaMemcpyDeviceToHost);
        if (status != cudaSuccess) {
            std::cerr << "Unable to copy device matrix for file: " << fname
                      << " (" << cudaGetErrorString(status) << ")" << std::endl;
            return;
        }
    }

    // Write the data to a file, reading the buffer in column-major order
    std::ofstream outfile(fname);
    if (!outfile.is_open()) {
        std::cerr << "Unable to open file: " << fname << std::endl;
        return;
    }

    for (int row = 0; row < rows; ++row) {
        for (int col = 0; col < cols; ++col) {
            outfile << std::setprecision(std::numeric_limits<float>::max_digits10+1)
                    << h_matrix[static_cast<size_t>(col) * rows + row] << "\t";
        }
        outfile << std::endl;
    }
}
--------------------------------------------------------------------------------