├── run ├── POST9.dat ├── POWV9.dat ├── evaluator └── evaluator.cpp ├── src ├── global.h ├── main.cpp ├── database.hpp ├── Lshape_route.hpp ├── graph.hpp ├── Lshape_route_detour.hpp ├── flute.hpp └── database_cuda.hpp ├── LICENSE └── README.md /run/POST9.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuhk-eda/InstantGR/HEAD/run/POST9.dat -------------------------------------------------------------------------------- /run/POWV9.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuhk-eda/InstantGR/HEAD/run/POWV9.dat -------------------------------------------------------------------------------- /run/evaluator: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cuhk-eda/InstantGR/HEAD/run/evaluator -------------------------------------------------------------------------------- /src/global.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | using namespace std; 22 | 23 | const bool LOG = true; 24 | int mode = 0; 25 | std::chrono::high_resolution_clock::time_point program_start; 26 | 27 | 28 | queue nets2output; 29 | char output_buffer[1000000000]; 30 | FILE *out_file; 31 | double input_time, output_time, Lshape_time, DAG_time; 32 | __managed__ double of_cost_scale = 1; 33 | 34 | inline double elapsed_time() { 35 | std::chrono::duration time_now = std::chrono::high_resolution_clock::now() - program_start; 36 | return time_now.count(); 37 | } 38 | 39 | void print_GPU_memory_usage() {// in Gigabytes 40 | size_t free_bytes, total_bytes; 41 | auto cuda_status = 
cudaMemGetInfo(&free_bytes, &total_bytes); 42 | assert(cuda_status == cudaSuccess); 43 | printf(" GPU memory consumption: %.2f GB", (total_bytes - free_bytes) / 1024.0 / 1024.0 / 1024.0); 44 | cout << endl; 45 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, CUHK EDA 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # InstantGR 2 | 3 | InstantGR is a GPU-accelerated global routing tool. 4 | 5 | Check out the following paper for more details. 6 | 7 | * Shiju Lin, Liang Xiao, Jinwei Liu and Evangeline Young, ["InstantGR: Scalable GPU Parallelization for Global Routing"](https://shijulin.github.io/files/1239_Final_Manuscript.pdf), ACM/IEEE International Conference on Computer-Aided Design (ICCAD), New Jersey, USA, Oct 27–31, 2024. 8 | 9 | ## Compile 10 | ```bash 11 | cd src 12 | nvcc main.cpp -o ../run/InstantGR -std=c++17 -x cu -O3 -arch=sm_80 13 | ``` 14 | You may want to change `-arch=sm_80` according to your GPU. For example, if you are running InstantGR on NVIDIA RTX 3090, you need to change it to `-arch=sm_86`. 15 | 16 | ## Run 17 | ```bash 18 | cd run 19 | ./InstantGR -cap -net -out 20 | ``` 21 | Example: 22 | ```bash 23 | ./InstantGR -cap ../benchmarks/mempool_tile_rank.cap -net ../benchmarks/mempool_tile_rank.net -out mempool_tile_rank.out 24 | ``` 25 | 26 | ## Evaluate 27 | ```bash 28 | cd run 29 | g++ -o evaluator evaluator.cpp -O3 -std=c++17 #compile the evaluator 30 | ./evaluator # run the evaluator 31 | ``` 32 | Example: 33 | ```bash 34 | ./evaluator ../benchmarks/mempool_tile_rank.cap ../benchmarks/mempool_tile_rank.net mempool_tile_rank.out 35 | ``` 36 | 37 | ## Benchmarks 38 | 39 | The ISPD2024 benchmarks can be downloaded [here](https://drive.google.com/drive/folders/1bon65UEAx8cjSvVhYJ-lgC8QMDX0fvUm). 40 | We provide a small case `mempool_tile_rank` in the folder `benchmarks` for simple testing. 
41 | 42 | ## Contact 43 | [Shiju Lin](https://shijulin.github.io/) (email: sjlin@cse.cuhk.edu.hk) 44 | 45 | ## License 46 | BSD 3-Clause License 47 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "Lshape_route.hpp" 2 | #include "Lshape_route_detour.hpp" 3 | #include "graph.hpp" 4 | #include "database.hpp" 5 | #include "database_cuda.hpp" 6 | 7 | void route() { 8 | build_cuda_database(); 9 | for(int i = 0; i < db::nets.size(); i++) 10 | if(db::nets[i].pins.size() == 1) nets2output.push(i); 11 | thread output_thread1(graph::output_nets); 12 | 13 | cudaFuncSetAttribute(graph::compute_presum, cudaFuncAttributeMaxDynamicSharedMemorySize, 120 * 1024); 14 | cudaFuncSetAttribute(graph::compute_presum_general, cudaFuncAttributeMaxDynamicSharedMemorySize, 60 * 1024); 15 | vector nets2route_all; 16 | nets2route_all.resize(nets.size()); 17 | for(int i = 0; i < nets.size(); i++) nets2route_all[i] = i; 18 | 19 | 20 | Lshape_route::Lshape_route(nets2route_all); 21 | if(LOG) graph::report_score(); 22 | output_thread1.join(); 23 | 24 | int of_threshold = 0; 25 | 26 | if(nets.size() > 20000000) mode = 1; 27 | 28 | graph::extract_congestionView<<>> (); 29 | graph::extract_congestionView_xsum<<>> (); 30 | graph::extract_congestionView_ysum<<>> (); 31 | auto of_nets = graph::ripup(of_threshold); 32 | graph::finish_nets(of_nets.second); 33 | thread output_thread2(graph::output_nets); 34 | Lshape_route_detour::Lshape_route_detour_wrap(of_nets.first); 35 | if(LOG) graph::report_score(); 36 | output_thread2.join(); 37 | graph::finish_nets(of_nets.first); 38 | graph::output_nets(); 39 | } 40 | 41 | void runtime_breakdown() { 42 | const int width = 20; 43 | double total_time = elapsed_time(); 44 | cout << fixed << setprecision(2); 45 | cout << endl << setw(width * 3) << setfill('-') << "-" << setfill(' ') << endl; 46 | cout << setw(width) << 
"Procedure" << setw(width) << "time (s)" << setw(width) << "percent (%)" << endl; 47 | cout << setw(width * 3) << setfill('.') << "." << setfill(' ') << endl; 48 | cout << setw(width) << "input" << setw(width) << input_time << setw(width) << input_time / total_time * 100 << endl; 49 | cout << setw(width) << "Lshape Route" << setw(width) << Lshape_time << setw(width) << Lshape_time / total_time * 100 << endl; 50 | cout << setw(width) << "DAG Route" << setw(width) << DAG_time << setw(width) << DAG_time / total_time * 100 << endl; 51 | cout << setw(width) << "total" << setw(width) << total_time << setw(width) << total_time / total_time * 100 << endl; 52 | cout << setw(width * 3) << setfill('-') << "-" << setfill(' ') << endl << endl; 53 | } 54 | 55 | 56 | 57 | int main(int argc, char *argv[]) { 58 | program_start = std::chrono::high_resolution_clock::now(); 59 | 60 | const int cap_file_idx = 2, net_file_idx = 4, out_file_idx = 6; 61 | db::read(argv[cap_file_idx], argv[net_file_idx]); 62 | out_file = fopen(argv[out_file_idx], "w"); 63 | readLUT("POWV9.dat", "POST9.dat"); 64 | route(); 65 | fclose(out_file); 66 | if(LOG) runtime_breakdown(); 67 | quick_exit(0); 68 | } -------------------------------------------------------------------------------- /src/database.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "global.h" 3 | #include "robin_hood.h" 4 | #include "flute.hpp" 5 | using namespace flute; 6 | 7 | namespace db { 8 | 9 | struct layer { 10 | string name; 11 | int dir;//0: horizontal/X; 1: vertical/Y 12 | double min_len; 13 | }; 14 | 15 | struct net { 16 | 17 | void init(int num, vector> &p, int minx, int maxx, int miny, int maxy); 18 | 19 | int hpwl, unfinished_subnet_count; 20 | string name; 21 | //vector> access_points; 22 | vector pins, extra_routes, subnets; 23 | }; 24 | 25 | const int MAX_NET_NUM = 60000000; 26 | 27 | int L, X, Y; 28 | double unit_length_wire_cost, unit_via_cost; 29 | vector 
unit_length_short_costs; 30 | vector x_edge_len, y_edge_len, dr_x, dr_y, dr2gr_x, dr2gr_y; 31 | vector layers(10); 32 | vector>> capacity; 33 | vector nets; 34 | 35 | void net::init(int num, vector> &access_points, int xmin, int xmax, int ymin, int ymax) { 36 | assert(pins.empty()); 37 | hpwl = xmax - xmin + ymax - ymin; 38 | int center_2x = xmin + xmax, center_2y = ymin + ymax; 39 | robin_hood::unordered_set p, p2D; 40 | 41 | vector enumerated_idx(num), best_idx(num); 42 | int min_diff = 1e9, min_metric = 0; 43 | function enumerate = [&] (int cur) { 44 | if(cur == num) { 45 | int diff = 0, metric = 0; 46 | for(int i = 0; i < num; i++) { 47 | int xi = access_points[i][enumerated_idx[i]] / Y % X, yi = access_points[i][enumerated_idx[i]] % Y; 48 | metric += abs(2 * xi - center_2x) + abs(2 * yi - center_2y); 49 | for(int j = 0; j < i; j++) { 50 | int xj = access_points[j][enumerated_idx[j]] / Y % X, yj = access_points[j][enumerated_idx[j]] % Y; 51 | diff += (xi != xj) + (yi != yj); 52 | } 53 | } 54 | if(diff < min_diff || (diff == min_diff && metric < min_metric)) { 55 | best_idx = enumerated_idx; 56 | min_diff = diff; 57 | min_metric = metric; 58 | 59 | } 60 | } else { 61 | for(int idx = 1; idx <= access_points[cur][0]; idx++) { 62 | enumerated_idx[cur] = idx; 63 | enumerate(cur + 1); 64 | } 65 | } 66 | }; 67 | 68 | if(num <= 4) { 69 | enumerate(0); 70 | for(int i = 0; i < num; i++) { 71 | p.insert(access_points[i][best_idx[i]]); 72 | p2D.insert(access_points[i][best_idx[i]] % (X * Y)); 73 | } 74 | } else { 75 | for(int i = 0; i < num; i++) { 76 | int min_dist = 1e9, selected_pin = -1; 77 | for(int j = 1; j <= access_points[i][0]; j++) { 78 | int pin = access_points[i][j], dist = abs(pin / Y % X * 2 - center_2x) + abs(pin % Y * 2 - center_2y); 79 | if(dist < min_dist) { 80 | min_dist = dist; 81 | selected_pin = pin; 82 | } 83 | } 84 | assert(selected_pin >= 0); 85 | p.insert(selected_pin); 86 | p2D.insert(selected_pin % (X * Y)); 87 | } 88 | } 89 | if(p2D.size() < 
p.size()) { 90 | robin_hood::unordered_map pmax, pmin; 91 | for(auto e : p) { 92 | int pos_2D = e % (X * Y); 93 | pmax[pos_2D] = pmax.count(pos_2D) ? max(pmax[pos_2D], e) : e; 94 | pmin[pos_2D] = pmin.count(pos_2D) ? min(pmin[pos_2D], e) : e; 95 | } 96 | for(auto e : p2D) { 97 | pins.emplace_back(pmax[e]); 98 | if(pmin[e] != pmax[e]) { 99 | extra_routes.emplace_back(pmin[e]); 100 | extra_routes.emplace_back(pmax[e]); 101 | } 102 | } 103 | } else { 104 | pins.reserve(p.size()); 105 | for(auto e : p) pins.emplace_back(e); 106 | } 107 | } 108 | 109 | 110 | 111 | void read(char cap_file_name[], char net_file_name[]) { 112 | double input_start_time = elapsed_time(); 113 | 114 | for(int i = 0; i < 10; i++) layers[i].name = "metal" + to_string(i + 1); 115 | auto read_int = [&] (vector &buffer, size_t &buf_pt) { 116 | int ans = 0; 117 | while(!isdigit(buffer[buf_pt])) buf_pt++; 118 | for(; isdigit(buffer[buf_pt]); buf_pt++) ans = ans * 10 + buffer[buf_pt] - '0'; 119 | return ans; 120 | }; 121 | auto read_double = [&] (vector &buffer, size_t &buf_pt) { 122 | double ans = 0, scale = 0; 123 | while(!isdigit(buffer[buf_pt])) buf_pt++; 124 | for(; isdigit(buffer[buf_pt]) || buffer[buf_pt] == '.'; buf_pt++) 125 | if(buffer[buf_pt] == '.') 126 | scale = 1; 127 | else 128 | scale *= 10, ans = ans * 10 + buffer[buf_pt] - '0'; 129 | return scale == 0 ? 
ans : ans / scale; 130 | }; 131 | auto read_string = [&] (vector &buffer, size_t &buf_pt, string &str) { 132 | while(buffer[buf_pt] == ' ' || buffer[buf_pt] == '\n') buf_pt++; 133 | size_t beg = buf_pt; 134 | while(buffer[buf_pt] != ' ' && buffer[buf_pt] != '\n') buf_pt++; 135 | str.resize(buf_pt - beg); 136 | for(size_t i = 0; i < str.size(); i++) str[i] = buffer[beg + i]; 137 | }; 138 | 139 | auto read_cap = [&] (char cap_file_name[]) { 140 | double cap_start_time = elapsed_time(); 141 | std::ifstream cap_file(string(cap_file_name), std::ios::ate); 142 | if(!cap_file.good()) cout << "failed to open the cap file" << endl; 143 | size_t fsize = cap_file.tellg(); 144 | cap_file.seekg(0, std::ios::beg); 145 | std::vector buffer(fsize + 1); 146 | cap_file.read(buffer.data(), fsize); 147 | buffer[fsize] = 0; 148 | size_t buf_pt = 0; 149 | 150 | L = read_int(buffer, buf_pt); 151 | X = read_int(buffer, buf_pt); 152 | Y = read_int(buffer, buf_pt); 153 | unit_length_wire_cost = read_double(buffer, buf_pt); 154 | unit_via_cost = read_double(buffer, buf_pt); 155 | unit_length_short_costs.resize(L); 156 | assert(L == 10); 157 | capacity = vector>> (L, vector> (X, vector (Y))); 158 | for(int i = 0; i < L; i++) unit_length_short_costs[i] = read_double(buffer, buf_pt); 159 | x_edge_len.resize(X - 1); 160 | y_edge_len.resize(Y - 1); 161 | 162 | 163 | for(int i = 0; i < X - 1; i++) x_edge_len[i] = read_int(buffer, buf_pt); 164 | for(int i = 0; i < Y - 1; i++) y_edge_len[i] = read_int(buffer, buf_pt); 165 | dr_x = vector (X, 0); 166 | dr_y = vector (Y, 0); 167 | for(int i = 0; i < X - 1; i++) dr_x[i + 1] = x_edge_len[i] + dr_x[i]; 168 | for(int i = 0; i < Y - 1; i++) dr_y[i + 1] = y_edge_len[i] + dr_y[i]; 169 | dr2gr_x = vector (dr_x.back() + 1, -1); 170 | dr2gr_y = vector (dr_y.back() + 1, -1); 171 | for(int i = 0; i < X; i++) dr2gr_x[dr_x[i]] = i; 172 | for(int i = 0; i < Y; i++) dr2gr_y[dr_y[i]] = i; 173 | 174 | for(int l = 0; l < L; l++) { 175 | string name; 176 | 
read_string(buffer, buf_pt, name); 177 | assert(name == layers[l].name); 178 | layers[l].dir = read_int(buffer, buf_pt); 179 | layers[l].min_len = read_double(buffer, buf_pt); 180 | fflush(stdout); 181 | for(int y = 0; y < Y; y++) 182 | for(int x = 0; x < X; x++) 183 | capacity[l][x][y] = read_double(buffer, buf_pt); 184 | if(l) assert(layers[l].dir != layers[l - 1].dir); 185 | } 186 | 187 | printf("[%5.1f] read cap file done: duration=%.2fs", elapsed_time(), elapsed_time() - cap_start_time); 188 | cout << endl; 189 | }; 190 | 191 | /* Parses the .net file: for each net, reads every pin's access points, tracks the 2D bounding box, and hands the pin list to net::init. */ 192 | auto read_net = [&] (char net_file_name[]) { 193 | double net_start_time = elapsed_time(); 194 | std::ifstream net_file(string(net_file_name), std::ios::ate); 195 | if(!net_file.good()) throw std::invalid_argument("failed to open the file '"s + net_file_name + '\''); /* fixed: previously reported cap_file_name, the wrong file, on a net-file open failure */ 196 | size_t fsize = net_file.tellg(); 197 | net_file.seekg(0, std::ios::beg); 198 | std::vector buffer(fsize + 1); 199 | net_file.read(buffer.data(), fsize); 200 | buffer[fsize] = 0; 201 | size_t buf_pt = 0; 202 | 203 | nets.reserve(MAX_NET_NUM); /* fixed: was the literal 600000000, 10x the declared MAX_NET_NUM (60000000) — apparent extra-zero typo; reserve() is only a capacity hint, so this stays behavior-compatible */ 204 | 205 | vector> access_points(10000, vector (20)); 206 | 207 | while(1) {//reading a net 208 | nets.emplace_back(net()); 209 | read_string(buffer, buf_pt, nets.back().name); 210 | int minx = X, maxx = 0, miny = Y, maxy = 0, pin_id = 0; 211 | while(1) {//reading a pin (which may have multiple access points) 212 | if(pin_id >= 10000) access_points.emplace_back(vector (20)); 213 | access_points[pin_id][0] = 0; 214 | while(1) {//reading an access point 215 | int l = read_int(buffer, buf_pt); 216 | int x = read_int(buffer, buf_pt); 217 | int y = read_int(buffer, buf_pt); 218 | minx = min(minx, x); 219 | maxx = max(maxx, x); 220 | miny = min(miny, y); 221 | maxy = max(maxy, y); 222 | 223 | access_points[pin_id][++access_points[pin_id][0]] = l * X * Y + x * Y + y; 224 | assert(access_points[pin_id][0] < 20); 225 | bool pin_end = false; 226 | while(!isdigit(buffer[buf_pt])) 227 | if(buffer[buf_pt++] == ']') { pin_end = true;
break; } 228 | if(pin_end) { 229 | pin_id++; 230 | break; 231 | } 232 | } 233 | bool net_end = false; 234 | while(!isdigit(buffer[buf_pt])) 235 | if(buffer[buf_pt++] == ')') { net_end = true; break; } 236 | if(net_end) { 237 | nets.back().init(pin_id, access_points, minx, maxx, miny, maxy); 238 | break; 239 | } 240 | } 241 | while(buffer[buf_pt] == ' ' || buffer[buf_pt] == '\n') buf_pt++; 242 | if(buf_pt == fsize) break; 243 | } 244 | 245 | printf("[%5.1f] read net file done: duration=%.2fs", elapsed_time(), elapsed_time() - net_start_time); 246 | cout << endl; 247 | 248 | }; 249 | 250 | read_cap(cap_file_name); 251 | read_net(net_file_name); 252 | 253 | input_time = elapsed_time() - input_start_time; 254 | } 255 | 256 | } -------------------------------------------------------------------------------- /src/Lshape_route.hpp: -------------------------------------------------------------------------------- 1 | #include "graph.hpp" 2 | 3 | namespace Lshape_route { 4 | 5 | //declaration 6 | void Lshape_route(vector &nets2route); 7 | 8 | //implementation 9 | 10 | __global__ void Lshape_route_cuda(int net_cnt, int net_offset, int *node_cnt_sum, int *nodes, int *par_nodes, double *dist, int *from, int *layer_range, int stamp) { 11 | int net_idx = blockIdx.x * blockDim.x + threadIdx.x;// net_idx-th net in this batch 12 | if(net_idx >= net_cnt) return; 13 | net_idx += net_offset; 14 | int node_cnt = node_cnt_sum[net_idx + 1] - node_cnt_sum[net_idx];// node count of the net 15 | int *net_routes = routes + pin_acc_num[net_ids[net_idx]] * ROUTE_PER_PIN; 16 | nodes += node_cnt_sum[net_idx]; 17 | par_nodes += node_cnt_sum[net_idx]; 18 | layer_range += node_cnt_sum[net_idx]; 19 | dist += (node_cnt_sum[net_idx] - node_cnt_sum[net_offset]) * L * L; 20 | from += (node_cnt_sum[net_idx] - node_cnt_sum[net_offset]) * L * L; 21 | //compute the via cost for each node and each layer range pair 22 | // node_dist[minl * L + maxl] is the total via cost to include layers in [minl, maxl] 23 | 
for(int i = 0; i < node_cnt; i++) { 24 | int l = nodes[i] / X / Y, x = nodes[i] / Y % X, y = nodes[i] % Y; 25 | double *node_dist = dist + i * L * L; 26 | for(int minl = 0; minl < L; minl++) { 27 | node_dist[minl * L + minl] = 0; 28 | for(int maxl = minl + 1; maxl < L; maxl++) 29 | node_dist[minl * L + maxl] = node_dist[minl * L + maxl - 1] + vcost[IDX(maxl - 1, x, y)]; 30 | for(int maxl = minl; maxl < L; maxl++) 31 | if(l < L && (minl > l || maxl < l)) node_dist[minl * L + maxl] = INF; 32 | } 33 | } 34 | for(int i = node_cnt - 1; i >= 1; i--) { 35 | int x = nodes[i] / Y % X, y = nodes[i] % Y;// current node 36 | int px = nodes[par_nodes[i]] / Y % X, py = nodes[par_nodes[i]] % Y;// parent node of current node 37 | int minx = min(x, px), maxx = max(x, px), miny = min(y, py), maxy = max(y, py); 38 | double *node_dist = dist + i * L * L, *par_dist = dist + par_nodes[i] * L * L; 39 | assert(par_nodes[i] < i); 40 | int *prev = from + i * L * L, cur_from[10]; 41 | double min_cost_cur[10], min_cost_par[100]; 42 | for(int l = 0; l < L; l++) { 43 | min_cost_cur[l] = min_cost_par[l * L + l] = INF; 44 | for(int minl = 0; minl <= l; minl++) 45 | for(int maxl = l; maxl < L; maxl++) 46 | if(node_dist[minl * L + maxl] < min_cost_cur[l]) { 47 | min_cost_cur[l] = node_dist[minl * L + maxl]; 48 | cur_from[l] = minl * L + maxl; 49 | } 50 | } 51 | if(x == px || y == py) { 52 | for(int l = 0; l < L; l++) { 53 | if((l & 1 ^ DIR) == 0 && y != py) continue; 54 | if((l & 1 ^ DIR) == 1 && x != px) continue; 55 | min_cost_par[l * L + l] = min_cost_cur[l] + graph::wire_segment_cost(l, minx, maxx, miny, maxy);//presum[IDX(l, maxx, maxy)] - presum[IDX(l, minx, miny)]; 56 | prev[l * L + l] = cur_from[l] * L * L + l * L + l; 57 | } 58 | } else { 59 | for(int curl = 0; curl < L; curl++) { 60 | double cost = min_cost_cur[curl]; 61 | if(curl & 1 ^ DIR) 62 | cost += graph::wire_segment_cost(curl, x, x, miny, maxy);//presum[IDX(curl, x, maxy)] - presum[IDX(curl, x, miny)]; 63 | else 64 | cost += 
graph::wire_segment_cost(curl, minx, maxx, y, y);//presum[IDX(curl, maxx, y)] - presum[IDX(curl, minx, y)]; 65 | for(int parl = curl & 1 ^ 1; parl < L; parl += 2) {// curl and parl must have different routing directions 66 | assert(curl % 2 != parl % 2); 67 | double cost2 = 0; 68 | for(int l = min(curl, parl) + 1; l < max(curl, parl); l++) 69 | cost2 += (curl & 1 ^ DIR) ? vcost[IDX(l, x, py)] : vcost[IDX(l, px, y)]; 70 | if(parl & 1 ^ DIR) 71 | cost2 += graph::wire_segment_cost(parl, px, px, miny, maxy);//presum[IDX(parl, px, maxy)] - presum[IDX(parl, px, miny)]; 72 | else 73 | cost2 += graph::wire_segment_cost(parl, minx, maxx, py, py);//presum[IDX(parl, maxx, py)] - presum[IDX(parl, minx, py)]; 74 | if(cost + cost2 < min_cost_par[parl * L + parl]) { 75 | min_cost_par[parl * L + parl] = cost + cost2; 76 | prev[parl * L + parl] = cur_from[curl] * L * L + curl * L + parl; 77 | } 78 | } 79 | } 80 | } 81 | 82 | 83 | for(int minl = 0; minl < L; minl++) { 84 | for(int maxl = minl + 1; maxl < L; maxl++) { 85 | //min_cost_par[minl * L + maxl] = min { min_cost_par[minl * L + maxl - 1], min_cost_par[maxl * L + maxl] } 86 | if(min_cost_par[minl * L + maxl - 1] <= min_cost_par[maxl * L + maxl]) { 87 | min_cost_par[minl * L + maxl] = min_cost_par[minl * L + maxl - 1]; 88 | prev[minl * L + maxl] = prev[minl * L + maxl - 1]; 89 | } else { 90 | min_cost_par[minl * L + maxl] = min_cost_par[maxl * L + maxl]; 91 | prev[minl * L + maxl] = prev[maxl * L + maxl]; 92 | } 93 | } 94 | for(int maxl = minl; maxl < L; maxl++) { 95 | par_dist[minl * L + maxl] += min_cost_par[minl * L + maxl]; 96 | } 97 | } 98 | } 99 | 100 | net_routes[0] = 1; 101 | layer_range[0] = 0; 102 | for(int minl = 0; minl < L; minl++) 103 | for(int maxl = minl; maxl < L; maxl++) 104 | if(dist[minl * L + maxl] < dist[layer_range[0]]) { 105 | layer_range[0] = minl * L + maxl; 106 | } 107 | for(int i = 0; i < node_cnt; i++) { 108 | int *prev = from + i * L * L; 109 | if(i > 0) layer_range[i] = 
prev[layer_range[par_nodes[i]]] / L / L; 110 | assert(dist[i * L * L + layer_range[i]] < INF); 111 | int minl = layer_range[i] / L, maxl = layer_range[i] % L, x = nodes[i] / Y % X, y = nodes[i] % Y; 112 | if(minl < maxl) { 113 | net_routes[net_routes[0]++] = IDX(minl, x, y); 114 | net_routes[net_routes[0]++] = IDX(maxl, x, y); 115 | } 116 | 117 | if(i == 0) continue; 118 | int px = nodes[par_nodes[i]] / Y % X, py = nodes[par_nodes[i]] % Y; 119 | int minx = min(x, px), maxx = max(x, px), miny = min(y, py), maxy = max(y, py); 120 | int curl = prev[layer_range[par_nodes[i]]] / L % L, parl = prev[layer_range[par_nodes[i]]] % L; 121 | if(px == x || py == y) { 122 | assert(curl == parl); 123 | net_routes[net_routes[0]++] = IDX(curl, minx, miny); 124 | net_routes[net_routes[0]++] = IDX(curl, maxx, maxy); 125 | graph::atomic_add_unit_demand_wire_segment(curl, minx, maxx, miny, maxy, stamp); 126 | } else { 127 | assert(curl % 2 != parl % 2); 128 | int minl = min(curl, parl), maxl = max(curl, parl); 129 | if(curl & 1 ^ DIR) { 130 | net_routes[net_routes[0]++] = IDX(curl, x, y); 131 | net_routes[net_routes[0]++] = IDX(curl, x, py); 132 | graph::atomic_add_unit_demand_wire_segment(curl, x, x, miny, maxy, stamp); 133 | 134 | net_routes[net_routes[0]++] = IDX(curl, x, py); 135 | net_routes[net_routes[0]++] = IDX(parl, x, py); 136 | 137 | net_routes[net_routes[0]++] = IDX(parl, x, py); 138 | net_routes[net_routes[0]++] = IDX(parl, px, py); 139 | graph::atomic_add_unit_demand_wire_segment(parl, minx, maxx, py, py, stamp); 140 | } else { 141 | net_routes[net_routes[0]++] = IDX(curl, x, y); 142 | net_routes[net_routes[0]++] = IDX(curl, px, y); 143 | graph::atomic_add_unit_demand_wire_segment(curl, minx, maxx, y, y, stamp); 144 | 145 | net_routes[net_routes[0]++] = IDX(curl, px, y); 146 | net_routes[net_routes[0]++] = IDX(parl, px, y); 147 | 148 | net_routes[net_routes[0]++] = IDX(parl, px, y); 149 | net_routes[net_routes[0]++] = IDX(parl, px, py); 150 | 
graph::atomic_add_unit_demand_wire_segment(parl, px, px, miny, maxy, stamp); 151 | } 152 | } 153 | } 154 | } 155 | 156 | void Lshape_route(vector &nets2route) { 157 | double Lshape_start_time = elapsed_time(); 158 | 159 | 160 | if(LOG) printf("[%5.1f] FLUTE", elapsed_time()); cerr << endl; 161 | #pragma omp parallel for num_threads(8) 162 | for(int i = 0; i < nets2route.size(); i++) { 163 | nets[nets2route[i]].construct_rsmt(); 164 | } 165 | if(LOG) printf("[%5.1f] FLUTE END", elapsed_time()); cerr << endl; 166 | 167 | 168 | //generate batches 169 | 170 | if(LOG) printf("[%5.1f] SORT\n", elapsed_time()); 171 | sort(nets2route.begin(), nets2route.end(), [] (int l, int r) { 172 | return nets[l].hpwl > nets[r].hpwl; 173 | }); 174 | if(LOG) printf("[%5.1f] SORT END\n", elapsed_time()); 175 | 176 | auto batches = generate_batches_rsmt(nets2route); 177 | reverse(batches.begin(), batches.end()); 178 | 179 | vector batch_cnt_sum(batches.size() + 1, 0); 180 | for(int i = 0; i < batches.size(); i++) { 181 | batch_cnt_sum[i + 1] = batch_cnt_sum[i] + batches[i].size(); 182 | for(int j = 0; j < batches[i].size(); j++) 183 | nets2route[batch_cnt_sum[i] + j] = batches[i][j]; 184 | } 185 | int pin_cnt = 0, net_cnt = nets2route.size(); 186 | for(auto net_id : nets2route) pin_cnt += nets[net_id].pins.size(); 187 | int *node_cnt_sum_cpu = new int[net_cnt + 1](); 188 | int *nodes_cpu = new int[pin_cnt * 2]; 189 | int *par_nodes_cpu = new int[pin_cnt * 2]; 190 | 191 | 192 | if(LOG) printf("[%5.1f] DFS\n", elapsed_time()); 193 | for(int id = 0; id < net_cnt; id++) { 194 | auto &graph = nets[nets2route[id]].rsmt; 195 | function dfs = [&] (int x, int par, int par_node_idx) { 196 | int node_idx = node_cnt_sum_cpu[id + 1]++; 197 | nodes_cpu[node_cnt_sum_cpu[id] + node_idx] = graph.back()[x]; 198 | par_nodes_cpu[node_cnt_sum_cpu[id] + node_idx] = par_node_idx; 199 | for(auto e : graph[x]) if(e != par) dfs(e, x, node_idx); 200 | }; 201 | dfs(0, -1, -1); 202 | node_cnt_sum_cpu[id + 1] += 
node_cnt_sum_cpu[id]; 203 | } 204 | if(LOG) printf("[%5.1f] DFS END\n", elapsed_time()); 205 | int max_num_nodes = 0; 206 | for(int i = 0; i < batches.size(); i++) 207 | max_num_nodes = max(max_num_nodes, node_cnt_sum_cpu[batch_cnt_sum[i + 1]] - node_cnt_sum_cpu[batch_cnt_sum[i]]); 208 | 209 | double *dist; 210 | int *from, *layer_range, *node_cnt_sum, *nodes, *par_nodes; 211 | 212 | print_GPU_memory_usage(); 213 | cudaMemcpy(net_ids, nets2route.data(), net_cnt * sizeof(int), cudaMemcpyHostToDevice); 214 | 215 | cudaMalloc(&node_cnt_sum, (net_cnt + 1) * sizeof(int)); 216 | cudaMemcpy(node_cnt_sum, node_cnt_sum_cpu, (net_cnt + 1) * sizeof(int), cudaMemcpyHostToDevice); 217 | cudaMalloc(&nodes, node_cnt_sum_cpu[net_cnt] * sizeof(int)); 218 | cudaMemcpy(nodes, nodes_cpu, node_cnt_sum_cpu[net_cnt] * sizeof(int), cudaMemcpyHostToDevice); 219 | cudaMalloc(&par_nodes, node_cnt_sum_cpu[net_cnt] * sizeof(int)); 220 | cudaMemcpy(par_nodes, par_nodes_cpu, node_cnt_sum_cpu[net_cnt] * sizeof(int), cudaMemcpyHostToDevice); 221 | 222 | print_GPU_memory_usage(); 223 | cudaMalloc(&dist, max_num_nodes * L * L * sizeof(double)); 224 | cudaMalloc(&from, max_num_nodes * L * L * sizeof(int)); 225 | cudaMalloc(&layer_range, node_cnt_sum_cpu[net_cnt] * sizeof(int)); 226 | 227 | print_GPU_memory_usage(); 228 | 229 | if(LOG) printf("[%5.1f] Lshape_cuda\n", elapsed_time()); 230 | for(int i = 0; i < batches.size(); i++) { 231 | global_timestamp++; 232 | graph::update_cost(); 233 | graph::compute_presum<<>> (); 234 | Lshape_route_cuda<<>> (batches[i].size(), batch_cnt_sum[i], node_cnt_sum, nodes, par_nodes, dist, from, layer_range, global_timestamp); 235 | graph::batch_wire_update(global_timestamp); 236 | graph::commit_via_demand<<>> (batches[i].size(), batch_cnt_sum[i], global_timestamp); 237 | } 238 | cudaDeviceSynchronize(); 239 | if(LOG) printf("[%5.1f] Lshape_cuda END\n", elapsed_time()); 240 | 241 | cudaFree(node_cnt_sum); 242 | cudaFree(nodes); 243 | cudaFree(par_nodes); 244 | 
cudaFree(dist); 245 | cudaFree(from); 246 | cudaFree(layer_range); 247 | 248 | printf("Lshape END. "); 249 | print_GPU_memory_usage(); 250 | 251 | Lshape_time = elapsed_time() - Lshape_start_time; 252 | } 253 | 254 | } -------------------------------------------------------------------------------- /src/graph.hpp: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #include "global.h" 4 | #include 5 | #include 6 | #include "database_cuda.hpp" 7 | 8 | namespace graph { 9 | 10 | //declaration 11 | void update_cost(); 12 | void output(char file_name[]); 13 | __global__ void compute_presum(); 14 | 15 | //implementation 16 | 17 | __device__ double wire_segment_cost(int layer, int xmin, int xmax, int ymin, int ymax) { 18 | return presum[IDX(layer, xmax, ymax)] - presum[IDX(layer, xmin, ymin)]; 19 | } 20 | 21 | __device__ double of_cost_scaled(float capacity, float demand) { 22 | if(capacity > 0.001) return __expf(min(0.5 * (demand - capacity), of_cost_scale * 0.5 * (demand - capacity))); 23 | if(demand > 0) return __expf(min(1.5 * demand, of_cost_scale * 1.5 * demand)); 24 | return 0; 25 | } 26 | __device__ double of_cost(float capacity, float demand) { 27 | if(capacity > 0.001) return __expf(0.5 * (demand - capacity)); 28 | if(demand > 0) return __expf(1.5 * demand); 29 | return 0; 30 | } 31 | 32 | __device__ double incremental_of_cost(int l, int x, int y, double incre_demand) { 33 | int idx = l * X * Y + x * Y + y; 34 | return min(1e12, (of_cost_scaled(capacity[idx], demand[idx] + incre_demand) - of_cost_scaled(capacity[idx], demand[idx])) * unit_length_short_costs[l]); 35 | } 36 | 37 | __global__ void update_wcost_cuda_ispd24() { 38 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 39 | if(idx >= L * X * Y) return; 40 | int l = idx / X / Y, x = idx / Y % X, y = idx % Y, dir = l & 1 ^ DIR; 41 | wcost[idx] = incremental_of_cost(l, x, y, 1); 42 | if(dir == 0 && x < X - 1) wcost[idx] += unit_length_wire_cost * 
x_edge_len[x]; 43 | if(dir == 1 && y < Y - 1) wcost[idx] += unit_length_wire_cost * y_edge_len[y]; 44 | } 45 | 46 | 47 | 48 | __device__ const double via_weight = 1; 49 | __device__ void update_single_vcost_ispd24(int l, int x, int y) { 50 | int idx = IDX(l, x, y); 51 | if(l & 1 ^ DIR) { 52 | if(y == 0) 53 | vcost[idx] = incremental_of_cost(l, x, y, 1); 54 | else if(y == Y - 1) 55 | vcost[idx] = incremental_of_cost(l, x, y - 1, 1); 56 | else 57 | vcost[idx] = incremental_of_cost(l, x, y, 0.5) + incremental_of_cost(l, x, y - 1, 0.5); 58 | } else { 59 | if(x == 0) 60 | vcost[idx] = incremental_of_cost(l, x, y, 1); 61 | else if(x == X - 1) 62 | vcost[idx] = incremental_of_cost(l, x - 1, y, 1); 63 | else 64 | vcost[idx] = incremental_of_cost(l, x, y, 0.5) + incremental_of_cost(l, x - 1, y, 0.5); 65 | } 66 | vcost[idx] += unit_via_cost; 67 | } 68 | 69 | 70 | __global__ void compute_presum_general(int *to_sum) { 71 | extern __shared__ int sum2[]; 72 | if(threadIdx.x == 0) sum2[0] = 0; 73 | int l = idx2track[blockIdx.x] / XY; 74 | if(l & 1 ^ DIR) { 75 | int x = idx2track[blockIdx.x] % XY; 76 | for(int y = threadIdx.x; y < Y - 1; y += blockDim.x) sum2[y + 1] = to_sum[IDX(l, x, y)]; 77 | __syncthreads(); 78 | for(int d = 0; (1 << d) < Y; d++) { 79 | for(int idx = threadIdx.x; idx < Y; idx += blockDim.x) 80 | if(idx >> d & 1) sum2[idx] += sum2[(idx >> d << d) - 1]; 81 | __syncthreads(); 82 | } 83 | for(int y = threadIdx.x; y < Y; y += blockDim.x) to_sum[IDX(l, x, y)] = sum2[y]; 84 | } else { 85 | int y = idx2track[blockIdx.x] % XY; 86 | for(int x = threadIdx.x; x < X - 1; x += blockDim.x) sum2[x + 1] = to_sum[IDX(l, x, y)]; 87 | __syncthreads(); 88 | for(int d = 0; (1 << d) < X; d++) { 89 | for(int idx = threadIdx.x; idx < X; idx += blockDim.x) 90 | if(idx >> d & 1) sum2[idx] += sum2[(idx >> d << d) - 1]; 91 | __syncthreads(); 92 | } 93 | for(int x = threadIdx.x; x < X; x += blockDim.x) to_sum[IDX(l, x, y)] = sum2[x]; 94 | } 95 | } 96 | 97 | __global__ void 
update_vcost_ispd24() { 98 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 99 | if(idx < L * X * Y) update_single_vcost_ispd24(idx / X / Y, idx / Y % X, idx % Y); 100 | } 101 | __device__ void atomic_add_unit_demand_wire_segment(int l, int minx, int maxx, int miny, int maxy, int stamp, int K = 1) { 102 | assert(minx == maxx || miny == maxy); 103 | 104 | atomicAdd(pre_demand + IDX(l, minx, miny), K); 105 | if(minx == maxx) { 106 | atomicAdd(pre_demand + IDX(l, minx, maxy), -K); 107 | timestamp[IDX(l, minx, maxy)] = stamp; 108 | } 109 | if(miny == maxy) { 110 | atomicAdd(pre_demand + IDX(l, maxx, miny), -K); 111 | timestamp[IDX(l, maxx, miny)] = stamp; 112 | } 113 | } 114 | __global__ void commit_all_edge(int stamp) { 115 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 116 | if(idx >= L * X * Y) return; 117 | int l = idx / X / Y, x = idx / Y % X, y = idx % Y; 118 | if((l & 1 ^ DIR) == 0 && x + 1 < X && pre_demand[IDX(l, x + 1, y)] > 0) { 119 | demand[IDX(l, x, y)] += pre_demand[IDX(l, x + 1, y)]; 120 | timestamp[IDX(l, x, y)] = stamp; 121 | atomicAdd(&total_wirelength, pre_demand[IDX(l, x + 1, y)] * x_edge_len[x]); 122 | } 123 | if((l & 1 ^ DIR) == 1 && y + 1 < Y && pre_demand[IDX(l, x, y + 1)] > 0) { 124 | demand[IDX(l, x, y)] += pre_demand[IDX(l, x, y + 1)]; 125 | timestamp[IDX(l, x, y)] = stamp; 126 | atomicAdd(&total_wirelength, pre_demand[IDX(l, x, y + 1)] * y_edge_len[y]); 127 | } 128 | } 129 | void batch_wire_update(int stamp) { 130 | compute_presum_general<<>> (pre_demand); 131 | commit_all_edge<<>> (stamp); 132 | cudaMemset(pre_demand, 0, sizeof(int) * L * X * Y); 133 | } 134 | 135 | 136 | void update_cost_ispd24() { 137 | update_wcost_cuda_ispd24<<>> (); 138 | update_vcost_ispd24<<>> (); 139 | } 140 | void update_cost() { 141 | update_cost_ispd24(); 142 | } 143 | 144 | __global__ void add_all_overflow_cost() { 145 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 146 | if(idx >= L * X * Y) return; 147 | int l = idx / X / Y, x = idx / Y % X, y = idx % 
Y; 148 | if(((l & 1 ^ DIR) == 0 && x + 1 < X) || ((l & 1 ^ DIR) == 1 && y + 1 < Y)) { 149 | double c = unit_length_short_costs[l] * of_cost(capacity[idx], demand[idx]); 150 | atomicAdd(&total_overflow_cost, c); 151 | atomicAdd(&layer_overflow_cost[l], c); 152 | } 153 | } 154 | 155 | void report_score() { 156 | printf("\n----------------------------------------------------------------------\n"); 157 | total_overflow_cost = 0; 158 | for(int i = 0; i < L; i++) layer_overflow_cost[i] = 0; 159 | add_all_overflow_cost<<>> (); 160 | cudaDeviceSynchronize(); 161 | printf(" WL Via OF Total\n%15.0f%15.0f%20.0f%50.0f\n", 162 | total_wirelength * db::unit_length_wire_cost, total_via_count * db::unit_via_cost, total_overflow_cost, 163 | total_wirelength * db::unit_length_wire_cost + total_via_count * db::unit_via_cost + total_overflow_cost); 164 | printf("----------------------------------------------------------------------\n\n"); 165 | } 166 | 167 | void finish_nets(vector &finished_nets) { 168 | for(auto _net_id : finished_nets) { 169 | int _ = nets[_net_id].original_net_id; 170 | if(--db::nets[_].unfinished_subnet_count == 0) nets2output.push(_); 171 | 172 | } 173 | } 174 | 175 | void output_nets() { 176 | auto write_int = [&] (int num) { 177 | if(num == 0) { 178 | putc_unlocked('0', out_file); 179 | } else { 180 | static char temp[20]; 181 | int len = 0; 182 | while(num) temp[len++] = num % 10, num /= 10; 183 | for(int i = len - 1; i >= 0; i--) putc_unlocked('0' + temp[i], out_file); 184 | } 185 | }; 186 | auto write_single_route = [&] (int x0, int y0, int l0, int x1, int y1, int l1) { 187 | write_int(x0); 188 | putc_unlocked(' ', out_file); 189 | write_int(y0); 190 | putc_unlocked(' ', out_file); 191 | write_int(l0); 192 | putc_unlocked(' ', out_file); 193 | write_int(x1); 194 | putc_unlocked(' ', out_file); 195 | write_int(y1); 196 | putc_unlocked(' ', out_file); 197 | write_int(l1); 198 | putc_unlocked('\n', out_file); 199 | }; 200 | auto write_route = [&] (int pos0, 
int pos1) { 201 | if(pos0 > pos1) swap(pos0, pos1); 202 | write_single_route(pos0 / Y % X, pos0 % Y, pos0 / X / Y, pos1 / Y % X, pos1 % Y, pos1 / X / Y); 203 | }; 204 | 205 | while(!nets2output.empty()) { 206 | int _ = nets2output.front(); 207 | nets2output.pop(); 208 | for(auto e : db::nets[_].name) putc_unlocked(e, out_file); 209 | putc_unlocked('\n', out_file); 210 | putc_unlocked('(', out_file); 211 | putc_unlocked('\n', out_file); 212 | for(auto pin : db::nets[_].pins) if(pin < X * Y) write_route(pin, pin + X * Y); 213 | for(int i = 0; i < db::nets[_].extra_routes.size(); i += 2) 214 | write_route(db::nets[_].extra_routes[i], db::nets[_].extra_routes[i + 1]); 215 | for(auto net_id : db::nets[_].subnets) { 216 | int *net_routes = routes + pin_cnt_sum_cpu[net_id] * ROUTE_PER_PIN; 217 | for(int j = 1; j < net_routes[0]; j += 2) write_route(net_routes[j] + X * Y, net_routes[j + 1] + X * Y); 218 | } 219 | putc_unlocked(')', out_file); 220 | putc_unlocked('\n', out_file); 221 | } 222 | } 223 | 224 | void output(char file_name[]) { 225 | double output_start_time = elapsed_time(); 226 | FILE *file = fopen(file_name, "w"); 227 | int char_cur = 0; 228 | const int CHAR_COUNT = 1000000000; 229 | static char out[CHAR_COUNT]; 230 | 231 | int write_count = 0; 232 | vector is_pin(X * Y, -1); 233 | static int *routes_cpu = new int[PIN_NUM * ROUTE_PER_PIN]; 234 | cudaMemcpy(routes_cpu, routes, sizeof(int) * PIN_NUM * ROUTE_PER_PIN, cudaMemcpyDeviceToHost); 235 | 236 | auto write_int = [&] (int num) { 237 | if(num == 0) 238 | out[char_cur++] = '0'; 239 | else { 240 | static char temp[20]; 241 | int len = 0; 242 | while(num) temp[len++] = num % 10, num /= 10; 243 | for(int i = len - 1; i >= 0; i--) out[char_cur++] = temp[i] + '0'; 244 | } 245 | }; 246 | auto write_single_route = [&] (int x0, int y0, int l0, int x1, int y1, int l1) { 247 | write_int(x0); 248 | out[char_cur++] = ' '; 249 | write_int(y0); 250 | out[char_cur++] = ' '; 251 | write_int(l0); 252 | out[char_cur++] = ' '; 
253 | write_int(x1); 254 | out[char_cur++] = ' '; 255 | write_int(y1); 256 | out[char_cur++] = ' '; 257 | write_int(l1); 258 | out[char_cur++] = '\n'; 259 | }; 260 | auto write_route = [&] (int pos0, int pos1) { 261 | if(pos0 > pos1) swap(pos0, pos1); 262 | write_single_route(pos0 / Y % X, pos0 % Y, pos0 / X / Y, pos1 / Y % X, pos1 % Y, pos1 / X / Y); 263 | }; 264 | 265 | for(int _ = 0; _ < db::nets.size(); _++) { 266 | if(char_cur * 1.1 > CHAR_COUNT) { 267 | fwrite(out, sizeof(char), char_cur, file), char_cur = 0; 268 | write_count++; 269 | } 270 | for(auto e : db::nets[_].name) out[char_cur++] = e; 271 | out[char_cur++] = '\n'; 272 | out[char_cur++] = '('; 273 | out[char_cur++] = '\n'; 274 | for(auto pin : db::nets[_].pins) if(pin < X * Y) is_pin[pin] = _; 275 | for(int i = 0; i < db::nets[_].extra_routes.size(); i += 2) 276 | write_route(db::nets[_].extra_routes[i], db::nets[_].extra_routes[i + 1]); 277 | for(auto net_id : db::nets[_].subnets) { 278 | auto &net = nets[net_id]; 279 | int *net_routes = routes_cpu + pin_cnt_sum_cpu[net_id] * ROUTE_PER_PIN; 280 | for(int j = 1; j < net_routes[0]; j += 2) { 281 | int pos0 = min(net_routes[j], net_routes[j + 1]), pos1 = max(net_routes[j], net_routes[j + 1]); 282 | int l0 = pos0 / X / Y + 1, x0 = pos0 / Y % X, y0 = pos0 % Y; 283 | int l1 = pos1 / X / Y + 1, x1 = pos1 / Y % X, y1 = pos1 % Y; 284 | if(l0 == 1 && l0 < l1 && is_pin[x0 * Y + y0] == _) l0 = 0, is_pin[x0 * Y + y0] = -1; 285 | write_single_route(x0, y0, l0, x1, y1, l1); 286 | } 287 | } 288 | for(auto pin : db::nets[_].pins) 289 | if(pin < X * Y && is_pin[pin] == _) write_route(pin, pin + X * Y); 290 | out[char_cur++] = ')'; 291 | out[char_cur++] = '\n'; 292 | } 293 | 294 | double fwrite_start_time = elapsed_time(); 295 | fwrite(out, sizeof(char), char_cur, file), char_cur = 0; 296 | //write(file, out, char_cur); 297 | fclose(file); 298 | printf(" write calls: %d\n", ++write_count); 299 | printf(" fwrite time: %.2f\n", elapsed_time() - fwrite_start_time); 300 | 
output_time = elapsed_time() - output_start_time; 301 | } 302 | 303 | __global__ void compute_presum() { 304 | extern __shared__ double sum[]; 305 | if(threadIdx.x == 0) sum[0] = 0; 306 | int l = idx2track[blockIdx.x] / XY; 307 | if(l & 1 ^ DIR) { 308 | int x = idx2track[blockIdx.x] % XY; 309 | for(int y = threadIdx.x; y < Y - 1; y += blockDim.x) sum[y + 1] = wcost[l * X * Y + x * Y + y]; 310 | __syncthreads(); 311 | for(int d = 0; (1 << d) < Y; d++) { 312 | for(int idx = threadIdx.x; idx < Y; idx += blockDim.x) 313 | if(idx >> d & 1) sum[idx] += sum[(idx >> d << d) - 1]; 314 | __syncthreads(); 315 | } 316 | for(int y = threadIdx.x; y < Y; y += blockDim.x) presum[l * X * Y + x * Y + y] = sum[y]; 317 | } else { 318 | int y = idx2track[blockIdx.x] % XY; 319 | for(int x = threadIdx.x; x < X - 1; x += blockDim.x) sum[x + 1] = wcost[l * X * Y + x * Y + y]; 320 | __syncthreads(); 321 | for(int d = 0; (1 << d) < X; d++) { 322 | for(int idx = threadIdx.x; idx < X; idx += blockDim.x) 323 | if(idx >> d & 1) sum[idx] += sum[(idx >> d << d) - 1]; 324 | __syncthreads(); 325 | } 326 | for(int x = threadIdx.x; x < X; x += blockDim.x) presum[l * X * Y + x * Y + y] = sum[x]; 327 | } 328 | } 329 | 330 | __global__ void mark_overflow_edges(int threshold) { 331 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 332 | if(idx >= L * X * Y) return; 333 | int l = idx / X / Y, x = idx / Y % X, y = idx % Y; 334 | if(((l & 1 ^ DIR) == 0 && x + 1 < X) || ((l & 1 ^ DIR) == 1 && y + 1 < Y)) 335 | of_edge_sum[idx] = (capacity[idx] + threshold <= demand[idx] ? 
1 : 0); 336 | } 337 | 338 | __global__ void mark_overflow_nets() { 339 | int net_id = blockIdx.x * blockDim.x + threadIdx.x; 340 | if(net_id >= NET_NUM) return; 341 | is_of_net[net_id] = false; 342 | int *net_routes = routes + pin_acc_num[net_id] * ROUTE_PER_PIN, of_count = 0, pin_cnt = pin_acc_num[net_id + 1] - pin_acc_num[net_id]; 343 | for(int i = 1; i < net_routes[0]; i += 2) { 344 | int l = net_routes[i] / X / Y, x0 = net_routes[i] / Y % X, y0 = net_routes[i] % Y; 345 | int x1 = net_routes[i + 1] / Y % X, y1 = net_routes[i + 1] % Y; 346 | if(x0 != x1) of_count += of_edge_sum[IDX(l, max(x0, x1), y0)] - of_edge_sum[IDX(l, min(x0, x1), y0)]; 347 | if(y0 != y1) of_count += of_edge_sum[IDX(l, x0, max(y0, y1))] - of_edge_sum[IDX(l, x0, min(y0, y1))]; 348 | if(of_count > 12) { 349 | is_of_net[net_id] = true; 350 | break; 351 | } 352 | } 353 | } 354 | 355 | __global__ void commit_wire_demand(int net_cnt, int net_offset, int stamp, int K = 1) { 356 | int net_id = blockIdx.x * blockDim.x + threadIdx.x; 357 | if(net_id >= net_cnt) return; 358 | net_id = net_ids[net_id + net_offset]; 359 | int *net_routes = routes + pin_acc_num[net_id] * ROUTE_PER_PIN; 360 | double wirelength = 0; 361 | //commit wires 362 | for(int i = 1; i < net_routes[0]; i += 2) if(net_routes[i] / X / Y == net_routes[i + 1] / X / Y) { 363 | int l = net_routes[i] / X / Y; 364 | int x0 = net_routes[i] / Y % X, y0 = net_routes[i] % Y; 365 | int x1 = net_routes[i + 1] / Y % X, y1 = net_routes[i + 1] % Y; 366 | if(x0 < x1) for(int x = x0; x < x1; x++) atomicAdd(demand + IDX(l, x, y0), K), wirelength += x_edge_len[x], timestamp[IDX(l, x, y0)] = stamp; 367 | if(x1 < x0) for(int x = x1; x < x0; x++) atomicAdd(demand + IDX(l, x, y0), K), wirelength += x_edge_len[x], timestamp[IDX(l, x, y0)] = stamp; 368 | if(y0 < y1) for(int y = y0; y < y1; y++) atomicAdd(demand + IDX(l, x0, y), K), wirelength += y_edge_len[y], timestamp[IDX(l, x0, y)] = stamp; 369 | if(y1 < y0) for(int y = y1; y < y0; y++) atomicAdd(demand + 
IDX(l, x0, y), K), wirelength += y_edge_len[y], timestamp[IDX(l, x0, y)] = stamp; 370 | timestamp[IDX(l, x0, y0)] = timestamp[IDX(l, x0, y1)] = timestamp[IDX(l, x1, y0)] = timestamp[IDX(l, x1, y1)] = stamp; 371 | } 372 | atomicAdd(&total_wirelength, wirelength * K); 373 | } 374 | 375 | __global__ void commit_via_demand(int net_cnt, int net_offset, int stamp, int K = 1) { 376 | int net_id = blockIdx.x * blockDim.x + threadIdx.x; 377 | if(net_id >= net_cnt) return; 378 | net_id = net_ids[net_id + net_offset]; 379 | int *net_routes = routes + pin_acc_num[net_id] * ROUTE_PER_PIN, via_count = 0; 380 | //commit vias 381 | for(int i = 1; i < net_routes[0]; i += 2) if(net_routes[i] / X / Y != net_routes[i + 1] / X / Y) { 382 | int x = net_routes[i] / Y % X, y = net_routes[i] % Y, l0 = net_routes[i] / X / Y, l1 = net_routes[i + 1] / X / Y; 383 | int minl = min(l0, l1), maxl = max(l0, l1); 384 | via_count += maxl - minl; 385 | for(int l = minl; l < maxl; l++) if(timestamp[IDX(l, x, y)] < stamp) { 386 | if(l & 1 ^ DIR) { 387 | if(y == 0) 388 | atomicAdd(demand + IDX(l, x, y), K); 389 | else if(y == Y - 1) 390 | atomicAdd(demand + IDX(l, x, y - 1), K); 391 | else { 392 | atomicAdd(demand + IDX(l, x, y - 1), 0.5 * K); 393 | atomicAdd(demand + IDX(l, x, y), 0.5 * K); 394 | } 395 | } else { 396 | if(x == 0) { 397 | atomicAdd(demand + IDX(l, x, y), K); 398 | } 399 | else if(x == X - 1) 400 | atomicAdd(demand + IDX(l, x - 1, y), K); 401 | else { 402 | atomicAdd(demand + IDX(l, x - 1, y), 0.5 * K); 403 | atomicAdd(demand + IDX(l, x, y), 0.5 * K); 404 | } 405 | } 406 | //timestamp[IDX(l, x, y)] = stamp; 407 | } 408 | } 409 | atomicAdd(&total_via_count, via_count * K); 410 | } 411 | 412 | __global__ void extract_congestionView() { 413 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 414 | if(idx >= L * X * Y) return; 415 | int x = idx / Y % X, y = idx % Y; 416 | double resource = capacity[idx] - demand[idx]; 417 | if(resource < 0) 418 | { 419 | congestion[x * Y + y] = true; 420 | 
congestion_xsum[x * Y + y] = max(congestion_xsum[x * Y + y], -resource); 421 | congestion_ysum[x * Y + y] = max(congestion_ysum[x * Y + y], -resource); 422 | } 423 | } 424 | 425 | __global__ void extract_congestionView_ysum() { 426 | extern __shared__ float sum3[]; 427 | if(threadIdx.x == 0) sum3[0] = 0; 428 | int x = blockIdx.x; // % XY; 429 | for(int y = threadIdx.x; y < Y - 1; y += blockDim.x) sum3[y + 1] = congestion_ysum[x*Y+y]; 430 | __syncthreads(); 431 | for(int d = 0; (1 << d) < Y; d++) { 432 | for(int idx = threadIdx.x; idx < Y; idx += blockDim.x) 433 | if(idx >> d & 1) sum3[idx] += sum3[(idx >> d << d) - 1]; 434 | __syncthreads(); 435 | } 436 | for(int y = threadIdx.x; y < Y; y += blockDim.x) congestion_ysum[x*Y+y] = sum3[y]; 437 | } 438 | 439 | __global__ void extract_congestionView_xsum() { 440 | extern __shared__ float sum3[]; 441 | if(threadIdx.x == 0) sum3[0] = 0; 442 | int y = blockIdx.x;// % XY; 443 | for(int x = threadIdx.x; x < X - 1; x += blockDim.x) sum3[x + 1] = congestion_xsum[x*Y+y]; 444 | __syncthreads(); 445 | for(int d = 0; (1 << d) < X; d++) { 446 | for(int idx = threadIdx.x; idx < X; idx += blockDim.x) 447 | if(idx >> d & 1) sum3[idx] += sum3[(idx >> d << d) - 1]; 448 | __syncthreads(); 449 | } 450 | for(int x = threadIdx.x; x < X; x += blockDim.x) congestion_xsum[x*Y+y] = sum3[x]; 451 | } 452 | 453 | bool is_of_net_cpu[db::MAX_NET_NUM]; 454 | 455 | pair, vector> ripup(int of_threshold) { 456 | assert(NET_NUM <= db::MAX_NET_NUM); 457 | 458 | cudaMemset(of_edge_sum, 0, sizeof(int) * L * X * Y); 459 | mark_overflow_edges<<>> (of_threshold); 460 | compute_presum_general<<>> (of_edge_sum); 461 | mark_overflow_nets<<>> (); 462 | cudaMemcpy(&is_of_net_cpu, is_of_net, sizeof(bool) * NET_NUM, cudaMemcpyDeviceToHost); 463 | vector of_nets, no_of_nets; 464 | for(int i = 0; i < NET_NUM; i++) 465 | (is_of_net_cpu[i] ? 
of_nets : no_of_nets).emplace_back(i); 466 | return make_pair(move(of_nets), move(no_of_nets)); 467 | } 468 | 469 | } -------------------------------------------------------------------------------- /run/evaluator.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | 22 | #define NVR_ASSERT(condition) assert(condition) 23 | 24 | enum NVR_Direction 25 | { 26 | NVR_DIR_HORIZONTAL, 27 | NVR_DIR_VERTICAL, 28 | NVR_DIR_BOTH, 29 | NVR_DIR_NONE 30 | }; 31 | 32 | 33 | class NVR_Point 34 | { 35 | public: 36 | NVR_Point() {} 37 | NVR_Point(int x, int y) {m_x = x; m_y = y;} 38 | void set(int x, int y) {m_x = x; m_y = y;} 39 | int x() const { return m_x;} 40 | int y() const { return m_y;} 41 | void set_x(int x) { m_x = x;} 42 | void set_y(int y) { m_y = y;} 43 | 44 | private: 45 | int m_x, m_y; 46 | }; 47 | 48 | class NVR_Box 49 | { 50 | public: 51 | NVR_Box() : lo(INT_MAX, INT_MAX), hi(INT_MIN, INT_MIN) {} 52 | void update(int x, int y) {update_x(x); update_y(y);} 53 | void update_x(int x) { 54 | if(x < lo.x()) { lo.set_x(x);} 55 | if(x > hi.x()) { hi.set_x(x);} 56 | } 57 | void update_y(int y) { 58 | if(y < lo.y()) { lo.set_y(y);} 59 | if(y > hi.y()) { hi.set_y(y);} 60 | } 61 | 62 | int hpwl() const {return width() + height(); } 63 | int width() const { return hi.x() - lo.x(); } 64 | int height() const { return hi.y() - lo.y(); } 65 | private: 66 | NVR_Point lo, hi; 67 | }; 68 | 69 | 70 | class NVR_Point3D 71 | { 72 | public: 73 | NVR_Point3D(unsigned x, unsigned y, unsigned z) : m_x(x), 74 | m_y(y), m_z(z) {} 75 | 76 | unsigned x() const {return m_x;} 77 | unsigned y() const {return m_y;} 78 | unsigned z() const {return m_z;} 79 | private: 80 | unsigned m_x; 81 | unsigned m_y; 82 
| unsigned m_z; 83 | }; 84 | 85 | typedef NVR_Point3D NVR_Access; 86 | 87 | class NVR_Pin 88 | { 89 | public: 90 | unsigned num_accesses() const {return m_access.size();} 91 | void add_access(unsigned x, unsigned y, unsigned z) { 92 | m_access.emplace_back(x, y, z); 93 | } 94 | const std::vector &access() const {return m_access;} 95 | private: 96 | std::vector m_access; 97 | }; 98 | 99 | class NVR_Net 100 | { 101 | public: 102 | NVR_Net(const std::string &name) : m_name(name){} 103 | const std::string &name() const { return m_name;} 104 | void set_name(const std::string &name) {m_name = name;} 105 | NVR_Pin &add_pin() {m_pins.emplace_back(); return m_pins.back();} 106 | unsigned idx() const { return m_idx;} 107 | void set_idx(unsigned idx) { m_idx = idx; } 108 | const std::vector &pins() const { return m_pins; } 109 | 110 | NVR_Box &box() { return const_cast(const_cast(this)->box());} 111 | const NVR_Box &box() const { return m_box;} 112 | //private: 113 | std::string m_name; 114 | unsigned m_idx; 115 | std::vector m_pins; 116 | NVR_Box m_box; 117 | 118 | }; 119 | 120 | 121 | class NVR_Gcell 122 | { 123 | public: 124 | void incr_demand(unsigned demand) { m_demand += demand;} 125 | void decr_demand(unsigned demand) { m_demand -= demand;} 126 | void set_demand(unsigned demand) { m_demand = demand;} 127 | unsigned demand() const {return m_demand;} 128 | 129 | double capacity() const {return m_capacity;} 130 | void set_capacity(double cap) {m_capacity = cap;} 131 | private: 132 | unsigned m_demand : 16; 133 | double m_capacity; 134 | }; 135 | 136 | class NVR_GridGraph2D 137 | { 138 | public: 139 | NVR_GridGraph2D() : m_dir(NVR_DIR_NONE){}; 140 | void set_name(const std::string &name) {m_name = name;} 141 | const std::string &name() const { return m_name;} 142 | 143 | void init(unsigned gridx, unsigned gridy); 144 | void set_direction(int dr); 145 | bool is_routing_layer() const { return m_dir != NVR_DIR_NONE;} 146 | bool is_hor() const {return m_dir == 
NVR_DIR_HORIZONTAL; } 147 | bool is_ver() const {return m_dir == NVR_DIR_VERTICAL;} 148 | 149 | void set_min_length(double min_length) {m_min_length = min_length;} 150 | double min_length() const {return m_min_length;} 151 | void set_unit_cost(double length, double via, double overflow); 152 | double unit_length_cost() const {return m_unit_length_cost;} 153 | double unit_via_cost() const {return m_unit_via_cost;} 154 | double unit_overflow_cost() const {return m_unit_overflow_cost;} 155 | 156 | NVR_Gcell &get_gcell(unsigned x, unsigned y); 157 | private: 158 | std::string m_name; 159 | NVR_Direction m_dir; 160 | double m_min_length; 161 | double m_unit_length_cost; 162 | double m_unit_via_cost; 163 | double m_unit_overflow_cost; 164 | 165 | unsigned m_num_gridx; 166 | unsigned m_num_gridy; 167 | std::vector m_gcells; 168 | }; 169 | 170 | void NVR_GridGraph2D::set_unit_cost(double length, double via, 171 | double overflow) { 172 | m_unit_length_cost = length; 173 | m_unit_via_cost = via; 174 | m_unit_overflow_cost = overflow; 175 | } 176 | 177 | void NVR_GridGraph2D::set_direction(int dir) 178 | { 179 | if(dir == 0) { 180 | m_dir = NVR_DIR_HORIZONTAL; 181 | } else if (dir == 1) { 182 | m_dir = NVR_DIR_VERTICAL; 183 | } else { 184 | NVR_ASSERT(0); 185 | } 186 | } 187 | 188 | void NVR_GridGraph2D::init(unsigned gridx, unsigned gridy) 189 | { 190 | m_num_gridx = gridx; 191 | m_num_gridy = gridy; 192 | m_gcells.resize(gridx * gridy); 193 | } 194 | 195 | NVR_Gcell &NVR_GridGraph2D::get_gcell(unsigned x, unsigned y) 196 | { 197 | NVR_ASSERT(x < m_num_gridx && y < m_num_gridy && is_routing_layer()); 198 | if(is_hor()) { 199 | return m_gcells[y * m_num_gridx + x]; 200 | } else { 201 | return m_gcells[x * m_num_gridy + y]; 202 | } 203 | } 204 | 205 | class NVR_GridGraph 206 | { 207 | public: 208 | unsigned num_gridx() const { return m_num_gridx; }; 209 | unsigned num_gridy() const { return m_num_gridy; }; 210 | unsigned num_layer() const { return m_num_layer; }; 211 | void 
init(unsigned x, unsigned y, unsigned z); 212 | NVR_GridGraph2D &plane(unsigned layer) {return m_plane[layer];} 213 | const NVR_GridGraph2D &plane(unsigned layer) const {return m_plane[layer];} 214 | 215 | void init_x_coords(std::vector &coord) { m_x_coords = coord; } 216 | void init_y_coords(std::vector &coord) { m_y_coords = coord; } 217 | int cell_width(int x) const { return m_x_coords[x + 1] - m_x_coords[x];} 218 | int cell_height(int y) const { return m_y_coords[y + 1] - m_y_coords[y];} 219 | private: 220 | unsigned m_num_layer; 221 | unsigned m_num_gridx; 222 | unsigned m_num_gridy; 223 | std::vector m_plane; 224 | std::vector m_x_coords; 225 | std::vector m_y_coords; 226 | }; 227 | 228 | 229 | class NVR_DB 230 | { 231 | public: 232 | 233 | bool read_files(int argc, char *argv[]); 234 | void profile(); 235 | 236 | private: 237 | bool read_graph(const char *); 238 | bool read_nets(const char *); 239 | bool read_gr_solution(const char *); 240 | void report_statistic(); 241 | bool check_connectivity(const NVR_Net *net, 242 | std::vector< std::vector< std::vector > > &flag) const; 243 | void update_stacked_via_counter(unsigned net_idx, const std::vector &via_loc, 244 | std::vector< std::vector< std::vector > > &flag, 245 | std::vector< std::vector< std::vector > > &stacked_via_counter) const; 246 | 247 | double overflowLossFunc(double overflow, double slope); 248 | 249 | std::vector m_nets; 250 | NVR_GridGraph m_graph; 251 | std::vector layer_directions; 252 | }; 253 | 254 | void segv_handler(int sig) { 255 | void *array[1024]; 256 | size_t size; 257 | 258 | // get void*'s for all entries on the stack 259 | size = backtrace(array, 10); 260 | 261 | // print out all the frames to stderr 262 | fprintf(stderr, "Error: signal %d:\n", sig); 263 | backtrace_symbols_fd(array, size, STDERR_FILENO); 264 | exit(1); 265 | } 266 | 267 | int main(int argc, char* argv[]) { 268 | 269 | signal(SIGSEGV, segv_handler); 270 | NVR_DB rdb; 271 | if( !rdb.read_files(argc, argv)) { 272 
| printf("input error\n"); 273 | return 0; 274 | } 275 | //rdb.profile(); 276 | return 1; 277 | } 278 | 279 | bool NVR_DB::read_files(int argc, char* argv[]) 280 | { 281 | if (argc < 4) { 282 | printf("Usage %s resource_file net_file GR_file", argv[0]); 283 | return false; 284 | } 285 | 286 | if(!read_graph(argv[1])) { 287 | return false; 288 | } 289 | 290 | if(!read_nets(argv[2])) { 291 | return false; 292 | } 293 | 294 | report_statistic(); 295 | 296 | if(!read_gr_solution(argv[3])) { 297 | return false; 298 | } 299 | 300 | return true; 301 | } 302 | 303 | void NVR_DB::report_statistic() 304 | { 305 | printf("Num nets = %ld\n", m_nets.size()); 306 | printf("Grid Graph Size (x, y, z)= %d x %d x %d\n", 307 | m_graph.num_gridx(), m_graph.num_gridy(), m_graph.num_layer()); 308 | return; 309 | } 310 | 311 | bool NVR_DB::read_graph(const char *input) 312 | { 313 | std::ifstream fin(input); 314 | if (!fin) { 315 | printf("Failed to open resource file.\n"); 316 | return false; 317 | } 318 | 319 | unsigned num_gridx, num_gridy, num_layers; 320 | fin >> num_layers; 321 | fin >> num_gridx; 322 | fin >> num_gridy; 323 | m_graph.init(num_gridx, num_gridy, num_layers); 324 | 325 | //printf("tmp1 %d %d %d\n", num_gridx, num_gridy, num_layers); 326 | 327 | double unit_length_cost, unit_via_cost; 328 | std::vector overflow_costs; 329 | fin >> unit_length_cost; 330 | fin >> unit_via_cost; 331 | overflow_costs.resize(num_layers); 332 | layer_directions.resize(num_layers); 333 | for(unsigned z = 0; z < num_layers; z++) { 334 | fin >> overflow_costs[z]; 335 | NVR_GridGraph2D &plane = m_graph.plane(z); 336 | plane.set_unit_cost(unit_length_cost, 337 | unit_via_cost, overflow_costs[z]); 338 | 339 | // printf("z=%d lengtjh_cost=%.2lf via_cost=%.2lf overflow=%.2lf\n", 340 | // unit_length_cost, unit_via_cost, overflow_costs[z]); 341 | } 342 | 343 | int gcell_length; 344 | std::vector coords; 345 | coords.resize(num_gridx); 346 | coords[0] = 0; 347 | for(unsigned x = 0; x < num_gridx - 1; 
x++) { 348 | fin >> gcell_length; 349 | coords[x + 1] = coords[x] + gcell_length; 350 | // printf("1 length=%d\n", gcell_length); 351 | } 352 | m_graph.init_x_coords(coords); 353 | 354 | coords.resize(num_gridy, 0); 355 | coords[0] = 0; 356 | for(unsigned y = 0; y < num_gridy - 1; y++) { 357 | fin >> gcell_length; 358 | coords[y + 1] = coords[y] + gcell_length; 359 | //printf("2 length=%d\n", gcell_length); 360 | } 361 | m_graph.init_y_coords(coords); 362 | 363 | 364 | 365 | std::string line; 366 | std::string layer_name; 367 | int direction; 368 | double capacity; 369 | double min_length; 370 | std::getline(fin, line); 371 | for(unsigned z = 0; z < num_layers; z++ ) { 372 | std::getline(fin, line); 373 | std::istringstream info(line); 374 | info >> layer_name; 375 | info >> direction; 376 | info >> min_length; 377 | NVR_GridGraph2D &plane = m_graph.plane(z); 378 | layer_directions[z] = direction; 379 | 380 | plane.set_name(layer_name); 381 | plane.set_min_length(min_length); 382 | printf("layer %d name=%s min_length=%.2lf dir=%d\n", 383 | z, layer_name.c_str(), min_length, direction); 384 | 385 | if(z != 0) { 386 | plane.set_direction(direction); 387 | if(plane.is_routing_layer()) { 388 | plane.init(num_gridx, num_gridy); 389 | } 390 | } 391 | 392 | for(unsigned y = 0; y < num_gridy; y++) { 393 | std::getline(fin, line); 394 | std::istringstream info(line); 395 | for(unsigned x = 0; x < num_gridx; x++) { 396 | info >> capacity; 397 | if(plane.is_routing_layer()) { 398 | plane.get_gcell(x, y).set_capacity(capacity); 399 | } 400 | } 401 | } 402 | } 403 | 404 | return true; 405 | } 406 | 407 | bool NVR_DB::read_nets(const char *input) 408 | { 409 | std::ifstream net_file(input); 410 | if (!net_file) { 411 | printf("Failed to open net file.\n"); 412 | return false; 413 | } 414 | 415 | m_nets.reserve(1000); 416 | std::string line; 417 | std::string redundant_chars = "(),[]"; 418 | while (std::getline(net_file, line)) { 419 | if (line.find("(") == std::string::npos && 
line.find(")") 420 | == std::string::npos && line.length()>1) { //start to read a net 421 | size_t found = line.find('\n'); 422 | if (found != std::string::npos) { 423 | line.erase(found, 1); 424 | } 425 | m_nets.emplace_back(line); 426 | } else if (line.find('[') != std::string::npos) { //read pins 427 | NVR_Net &net = m_nets.back(); 428 | net.set_idx(m_nets.size() - 1); 429 | line.erase(std::remove_if(line.begin(), line.end(), [&redundant_chars](char c) { 430 | return redundant_chars.find(c) != std::string::npos; 431 | }), line.end()); 432 | std::istringstream ss(line); 433 | 434 | NVR_Pin &pin = net.add_pin(); 435 | int x, y, z; 436 | while (ss >> z >> x >> y) { 437 | //printf("access (x, y, z) = (%d, %d %d)\n", x, y, z); 438 | pin.add_access(x, y, z); 439 | net.box().update(x, y); 440 | } 441 | } 442 | } 443 | 444 | /* 445 | for(const NVR_Net &net : m_nets) { 446 | printf("net %s\n", net.name().c_str()); 447 | for(const NVR_Pin &pin : net.pins()) { 448 | for(const NVR_Access &access : pin.access()) { 449 | printf("(%d, %d, %d) ", access.x(), access.y(), access.z()); 450 | } 451 | printf("\n"); 452 | } 453 | } 454 | */ 455 | return true; 456 | } 457 | 458 | bool NVR_DB::read_gr_solution(const char *input) 459 | { 460 | std::ifstream fin(input); 461 | if (!fin) { 462 | printf("Failed to open solution file.\n"); 463 | return false; 464 | } 465 | 466 | std::unordered_map net_mapper; 467 | std::unordered_map net_completed; 468 | for(NVR_Net &net : m_nets) { 469 | net_mapper[net.name()] = &net; 470 | net_completed[net.name()] = false; 471 | } 472 | 473 | unsigned long total_opens = 0; 474 | std::vector total_vias(m_graph.num_layer(), 0); 475 | std::vector< std::vector< std::vector > > flag; 476 | std::vector< std::vector< std::vector > > wire_counter; 477 | std::vector< std::vector< std::vector > > stacked_via_counter; 478 | 479 | flag.resize(m_graph.num_layer()); 480 | wire_counter.resize(m_graph.num_layer()); 481 | stacked_via_counter.resize(m_graph.num_layer()); 
482 | for(unsigned z = 0; z < m_graph.num_layer(); z++) { 483 | flag[z].resize(m_graph.num_gridx()); 484 | wire_counter[z].resize(m_graph.num_gridx()); 485 | stacked_via_counter[z].resize(m_graph.num_gridx()); 486 | for(unsigned x = 0; x < m_graph.num_gridx(); x++) { 487 | flag[z][x].resize(m_graph.num_gridy(), -1); 488 | wire_counter[z][x].resize(m_graph.num_gridy(), 0); 489 | stacked_via_counter[z][x].resize(m_graph.num_gridy(), 0); 490 | } 491 | } 492 | 493 | std::vector via_loc; 494 | bool has_connectivity_violation = false; 495 | NVR_Net *net = NULL; 496 | std::string line; 497 | while(std::getline(fin, line)) { 498 | //printf("read %s\n", line.c_str()); 499 | if(!net) { 500 | net = net_mapper[line]; 501 | has_connectivity_violation = false; 502 | } else if(line[0] == '(') { 503 | } else if(line[0] == ')') { 504 | update_stacked_via_counter(net->idx(), via_loc, flag, stacked_via_counter); 505 | if(has_connectivity_violation) { 506 | total_opens++; 507 | } else { 508 | NVR_ASSERT(net); 509 | if(!check_connectivity(net, flag)) { 510 | std::cerr << net->name() << std::endl; 511 | exit(0); 512 | total_opens++; 513 | } else { 514 | net_completed[net->name()] = true; 515 | } 516 | } 517 | net = NULL; 518 | via_loc.clear(); 519 | } else { 520 | //printf("wire %s\n", line.c_str()); 521 | std::istringstream ss(line); 522 | int xl, yl, zl, xh, yh, zh; 523 | ss >> xl >> yl >> zl >> xh >> yh >> zh; 524 | //printf("(%d, %d, %d) (%d, %d, %d)\n", xl, yl, zl, xh, yh, zh); 525 | if(zh != zl) { // via 526 | if(xh == xl && yh == yl) { 527 | for(unsigned z = zl; z < zh; z++) { 528 | total_vias[z]++; 529 | via_loc.emplace_back(xl, yl, z); 530 | } 531 | // flag[zh][xl][yl] = net->idx(); 532 | } else { 533 | NVR_ASSERT(0); 534 | has_connectivity_violation = true; 535 | } 536 | } else { //wire 537 | NVR_GridGraph2D &plane = m_graph.plane(zl); 538 | if(plane.is_hor()) { 539 | if(xh > xl && yh == yl) { 540 | for(unsigned x = xl; x < xh; x++) { 541 | flag[zl][x][yl] = net->idx(); 542 | 
wire_counter[zl][x][yl]++; 543 | } 544 | flag[zl][xh][yl] = net->idx(); 545 | } else { 546 | NVR_ASSERT(0); 547 | has_connectivity_violation = true; 548 | } 549 | } else if(plane.is_ver()) { 550 | if(yh > yl && xh == xl) { 551 | for(unsigned y = yl; y < yh; y++) { 552 | flag[zl][xl][y] = net->idx(); 553 | wire_counter[zl][xl][y]++; 554 | } 555 | flag[zl][xl][yh] = net->idx(); 556 | } else { 557 | NVR_ASSERT(0); 558 | has_connectivity_violation = true; 559 | } 560 | } else { //unroutable layer 561 | NVR_ASSERT(0); 562 | has_connectivity_violation = true; 563 | } 564 | } 565 | } 566 | } 567 | double wl_cost = 0; 568 | double via_cost = 0; 569 | double overflow_cost = 0; 570 | double overflow_slope = 0.5; 571 | 572 | for(unsigned z = 0; z < m_graph.num_layer(); z++) { 573 | NVR_GridGraph2D &gg = m_graph.plane(z); 574 | if(!gg.is_routing_layer()) { 575 | via_cost += double(total_vias[z]) * gg.unit_via_cost(); 576 | continue; 577 | } 578 | 579 | unsigned long long total_wl = 0; 580 | double layer_overflows = 0; 581 | double overflow = 0; 582 | for(unsigned x = 0; x < m_graph.num_gridx(); x++) { 583 | for(unsigned y = 0; y < m_graph.num_gridy(); y++) { 584 | NVR_Gcell &cell = gg.get_gcell(x, y); 585 | int demand = 2 * wire_counter[z][x][y] + stacked_via_counter[z][x][y]; 586 | //if(z == 1) printf("(%d, %d): %.1f\n", x, y, demand * 0.5); 587 | //if(z == 1 && x == 14 && y == 226) std::cerr << "WIRE/VIA COUNT " << wire_counter[z][x][y] << ' ' << stacked_via_counter[z][x][y] << std::endl; 588 | //if(z == 1 && x == 24 && y == 297) printf("WIRE COUNT = %d VIA COUNT = %d\n", wire_counter[z][x][y], stacked_via_counter[z][x][y]); 589 | cell.set_demand(demand); 590 | 591 | if (cell.capacity() > 0.001 ){ 592 | overflow = double(cell.demand()) - 2 * cell.capacity(); 593 | layer_overflows += overflowLossFunc(overflow/2, overflow_slope); 594 | } else if (cell.capacity() >= 0 && cell.demand() > 0) { 595 | layer_overflows += overflowLossFunc(1.5 * double(cell.demand()), overflow_slope); 
// Records the gcells occupied by one net's vias and charges the via's wire-track
// demand onto the via layer's adjacent cells.
// - net_idx: index of the net whose vias are being committed.
// - via_loc: bottom endpoints of the net's vias (a via spans layer z..z+1).
// - flag: per-gcell owner mark, used to de-duplicate contributions per net.
// - stacked_via_counter: per-gcell via-demand accumulator updated in place.
// NOTE(review): the element types of the nested std::vector parameters were lost
// in extraction (likely std::vector<NVR_Point3D> and nested <int>); confirm
// against the original header.
void NVR_DB::update_stacked_via_counter(unsigned net_idx,
                                        const std::vector &via_loc,
                                        std::vector< std::vector< std::vector > > &flag,
                                        std::vector< std::vector< std::vector > > &stacked_via_counter) const
{
    for(const NVR_Point3D &pp : via_loc) {
        // Only the first via of this net seen at a gcell contributes demand;
        // 'flag' de-duplicates repeated vias of the same net at one location.
        if(flag[pp.z()][pp.x()][pp.y()] != net_idx) {
            //if(pp.z() == 1 && pp.x() == 11 && (pp.y() == 268 || pp.y() == 269))
            //    std::cerr << "NET NAME " << m_nets[net_idx].m_name << std::endl << pp.z() << ' ' << pp.x() << ' ' << pp.y() << std::endl;
            flag[pp.z()][pp.x()][pp.y()] = net_idx;

            int direction = layer_directions[pp.z()];
            if(direction == 0) {
                // direction 0: charge the two x-adjacent cells one unit each;
                // at a grid border the single existing neighbor is charged 2,
                // so the total charge is always 2 units.
                if ((pp.x() > 0) && (pp.x() < m_graph.num_gridx() - 1)) {
                    stacked_via_counter[pp.z()][pp.x()-1][pp.y()]++;
                    stacked_via_counter[pp.z()][pp.x()][pp.y()]++;
                } else if (pp.x() > 0 ) {
                    stacked_via_counter[pp.z()][pp.x()-1][pp.y()] += 2;
                } else if (pp.x() < m_graph.num_gridx() - 1) {
                    stacked_via_counter[pp.z()][pp.x()][pp.y()] += 2;
                }
            } else if (direction == 1) {
                // direction 1: same scheme along y.
                if ((pp.y() > 0) && (pp.y() < m_graph.num_gridy() - 1)) {
                    stacked_via_counter[pp.z()][pp.x()][pp.y()-1]++;
                    stacked_via_counter[pp.z()][pp.x()][pp.y()]++;
                } else if (pp.y() > 0 ) {
                    stacked_via_counter[pp.z()][pp.x()][pp.y()-1] += 2;
                } else if (pp.y() < m_graph.num_gridy() - 1) {
                    stacked_via_counter[pp.z()][pp.x()][pp.y()] += 2;
                }
            }
            //stacked_via_counter[pp.z()][pp.x()][pp.y()]++;
        }

    }

    // A via connects layers z and z+1: mark both endpoint gcells as owned by
    // the net so the later connectivity trace can cross the via.
    // NOTE(review): assumes pp.z()+1 < num_layer for every via — no bound
    // check here; verify vias are never created on the top layer.
    for(const NVR_Point3D &pp : via_loc) {
        flag[pp.z()][pp.x()][pp.y()] = net_idx;
        flag[pp.z()+1][pp.x()][pp.y()] = net_idx;
    }
}
pp.z()); 696 | const NVR_GridGraph2D &gg = m_graph.plane(pp.z()); 697 | if(gg.is_hor()) { 698 | //printf("west\n"); 699 | if(pp.x() > 0 && flag[pp.z()][pp.x() - 1][pp.y()] == mark) { 700 | flag[pp.z()][pp.x() - 1][pp.y()] = traced_mark; 701 | stack.emplace_back(pp.x() - 1, pp.y(), pp.z()); 702 | } 703 | //printf("east\n"); 704 | if(pp.x() < m_graph.num_gridx() - 1 && flag[pp.z()][pp.x() + 1][pp.y()] == mark) { //west 705 | flag[pp.z()][pp.x() + 1][pp.y()] = traced_mark; 706 | stack.emplace_back(pp.x() + 1, pp.y(), pp.z()); 707 | } 708 | } else if(gg.is_ver()) { 709 | //printf("south\n"); 710 | if(pp.y() > 0 && flag[pp.z()][pp.x()][pp.y() - 1] == mark) { 711 | flag[pp.z()][pp.x()][pp.y() - 1] = traced_mark; 712 | stack.emplace_back(pp.x(), pp.y() - 1, pp.z()); 713 | } 714 | //printf("north\n"); 715 | if(pp.y() < m_graph.num_gridy() - 1 && flag[pp.z()][pp.x()][pp.y() + 1] == mark) { //west 716 | flag[pp.z()][pp.x()][pp.y() + 1] = traced_mark; 717 | stack.emplace_back(pp.x(), pp.y() + 1, pp.z()); 718 | } 719 | } 720 | 721 | //printf("down\n"); 722 | if(pp.z() > 0 && flag[pp.z() - 1][pp.x()][pp.y()] == mark) { 723 | flag[pp.z() - 1][pp.x()][pp.y()] = traced_mark; 724 | stack.emplace_back(pp.x(), pp.y(), pp.z() - 1); 725 | } 726 | 727 | //printf("up\n"); 728 | if(pp.z() < m_graph.num_layer() - 1 && flag[pp.z() + 1][pp.x()][pp.y()] == mark) { //west 729 | flag[pp.z() + 1][pp.x()][pp.y()] = traced_mark; 730 | stack.emplace_back(pp.x(), pp.y(), pp.z() + 1); 731 | } 732 | } 733 | 734 | //printf("end propagate\n"); 735 | for(unsigned i = 1; i < net->pins().size(); i++) { 736 | bool connected = false; 737 | for( const NVR_Access &ac : net->pins()[i].access()) { 738 | if(flag[ac.z()][ac.x()][ac.y()] == traced_mark) { 739 | connected = true; 740 | break; 741 | } 742 | } 743 | if(!connected) { 744 | return false; 745 | } 746 | } 747 | 748 | return true; 749 | } 750 | 751 | void NVR_DB::profile() 752 | { 753 | NVR_ASSERT(m_nets.size()); 754 | int max_hpwl = 0; 755 | double 
avg_hpwl = 0; 756 | std::vector sorted_hpwl; 757 | sorted_hpwl.reserve(m_nets.size()); 758 | for(const NVR_Net &net : m_nets) { 759 | int hpwl = net.box().hpwl(); 760 | max_hpwl = std::max(max_hpwl, hpwl); 761 | avg_hpwl += hpwl; 762 | sorted_hpwl.emplace_back(hpwl); 763 | } 764 | avg_hpwl = avg_hpwl / double(m_nets.size()); 765 | 766 | int top_10percent = m_nets.size() / 10; 767 | std::sort(sorted_hpwl.begin(), sorted_hpwl.end()); 768 | double sum_10percent = std::accumulate(sorted_hpwl.begin(), sorted_hpwl.end(), 0); 769 | double avg_top_10percent = sum_10percent / double(top_10percent); 770 | 771 | 772 | printf("max_hpwl=%d avg_hpwl=%.2lf avg_hpwl[10%%]=%.2lf\n", 773 | max_hpwl, avg_hpwl, avg_top_10percent); 774 | return; 775 | } 776 | 777 | 778 | void NVR_GridGraph::init(unsigned x, unsigned y, unsigned z) 779 | { 780 | m_num_gridx = x; 781 | m_num_gridy = y; 782 | m_num_layer = z; 783 | m_plane.resize(z); 784 | } 785 | 786 | double NVR_DB::overflowLossFunc(double overflow, double slope) 787 | { 788 | return exp(overflow * slope); 789 | } 790 | 791 | -------------------------------------------------------------------------------- /src/Lshape_route_detour.hpp: -------------------------------------------------------------------------------- 1 | #include "graph.hpp" 2 | #include 3 | #include 4 | #include 5 | #include 6 | #define INF_LAYER 20 7 | #define MAX_LAYER 10 8 | #define MIN_ROUTE_LAYER 1 9 | #define MAX_DEPTH 5000 10 | __managed__ double *cost_edges; 11 | __managed__ int *best_change; 12 | __managed__ int edge_cnt; 13 | 14 | namespace Lshape_route_detour { 15 | 16 | //declaration 17 | void Lshape_route_detour(vector &nets2route); 18 | __managed__ int *macroBorder; 19 | int *macroBorder_cpu; 20 | int cntCongested = 0; 21 | int totalEdgeNum = 0; 22 | 23 | __managed__ int *node_cnt_sum, *nodes, *par_nodes, *from, *layer_range; 24 | int node_cnt_estimate; 25 | int parent_cnt_estimate; 26 | __managed__ int *child_num; 27 | __managed__ int *child_num_sum; 28 | 
__managed__ int *in_degree;
__managed__ int *currentChildIDX;

__managed__ int *par_num;
__managed__ int *par_num_sum;
__managed__ int *locks;
__managed__ double *childCosts;     // per (child slot, layer) best subtree cost
__managed__ int *childCosts_road;   // packed child-choice record matching childCosts

__managed__ int *best_path;

__managed__ int *layer_output;
__managed__ double *costs;          // per (node, layer) DP cost table
__managed__ int *fixed_layers;
__managed__ int *node_net_idx;
__managed__ int *node_net_idx2;

__managed__ int *lock_gpu;

__managed__ int *node_depth;
__managed__ int *net_depth;

__managed__ int *batch_depth;
__managed__ int *depth_node;
__managed__ int *depth_node_cnt;

// Host-side mirrors of the device congestion maps and DAG layout arrays.
bool *congestionView_cpu;
float *congestionView_xsum_cpu;
float *congestionView_ysum_cpu;
int *node_cnt_sum_cpu, *node_depth_cpu, *net_depth_cpu, *batch_depth_cnt_cpu, *depth_node_cnt_cpu, *depth_node_cpu,
    *nodes_cpu, *node_net_idx_cpu, *node_net_idx2_cpu, *child_num_cpu, *child_num_sum_cpu, *par_num_cpu, *par_num_sum_cpu,
    *par_nodes_cpu, *currentChildIDX_cpu, *depthID2nodeSequence;

// Atomic min for doubles via CAS on the 64-bit bit pattern.
// Early-exits once the stored value is already <= val. Assumes non-NaN
// operands (costs here are finite and non-negative).
__device__ void atomicMinDouble(double *address, double val) {
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do {
        assumed = old;
        if (__longlong_as_double(assumed) <= val) {
            break;
        }
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val));
    } while (assumed != old);
}

// Fills childCosts[0, limit) with INF.
// FIX: 'limit' was previously unused — no bounds guard, so the kernel was
// only safe for launch configurations providing exactly 'limit' threads.
__global__ void init_min_child_costs(int limit) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= limit) {
        return;
    }
    childCosts[index] = INF;
}

// Fills childCosts_road[0, limit) with a large "no route chosen" sentinel.
// FIX: added the missing bounds guard (same defect as init_min_child_costs).
__global__ void init_road(int limit) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= limit) {
        return;
    }
    childCosts_road[index] = 200000000;
}

// Fills costs[0, limit) with INF.
// FIX: guard was off by one ('index > limit' let index == limit through,
// permitting a one-element out-of-bounds write for launches that round the
// thread count up).
__global__ void init_costs(int limit) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index >= limit) {
        return;
    }
    costs[index] = INF;
}
costs[index] = INF; 90 | } 91 | 92 | __global__ void Lshape_route_node_cuda(int shift, int end_shift) { 93 | int node_sequence = blockIdx.x * blockDim.x + threadIdx.x + shift; 94 | if(node_sequence>=end_shift) 95 | { 96 | return; 97 | } 98 | int node_idx = depth_node[node_sequence]; 99 | int parent_num_cur = par_num_sum[node_idx+1]-par_num_sum[node_idx]; 100 | int fixed_layer_low = 1 + nodes[node_idx] / X / Y; 101 | int x = nodes[node_idx] / Y % X, y = nodes[node_idx] % Y; 102 | int fixed_layer_high = fixed_layer_low==10?0:fixed_layer_low; 103 | int cur_child_num = child_num_sum[node_idx+1]-child_num_sum[node_idx]; 104 | 105 | int *cur_best_path = best_path + child_num_sum[node_idx] * MAX_LAYER; 106 | double *cur_childCosts = childCosts + child_num_sum[node_idx] * MAX_LAYER; 107 | int *cur_childCosts_road = childCosts_road + child_num_sum[node_idx] * MAX_LAYER; 108 | double minChildCosts[6]; 109 | int bestPaths[6]; 110 | for (int lowLayerIndex = MIN_ROUTE_LAYER; lowLayerIndex <= fixed_layer_low; lowLayerIndex++) { 111 | for(int cid=0; cidlowLayerIndex) 118 | { 119 | // min value of lowLayerIndex is 1 120 | via_cost += vcost[IDX(layerIndex - 2, x, y)]; 121 | } 122 | // int min_layer = 10; 123 | for (int childIndex = 0; childIndex < cur_child_num; childIndex++) { 124 | double cur_child_cost = cur_childCosts[childIndex * MAX_LAYER + layerIndex]; 125 | if (cur_child_cost < minChildCosts[childIndex]) { 126 | minChildCosts[childIndex] = cur_child_cost; 127 | bestPaths[childIndex] = cur_childCosts_road[childIndex * MAX_LAYER + layerIndex] * MAX_LAYER + layerIndex; 128 | } 129 | } 130 | if (layerIndex >= fixed_layer_high) { 131 | double cost = via_cost; 132 | for (int childIndex = 0; childIndex < cur_child_num; childIndex++) 133 | { 134 | cost += minChildCosts[childIndex]; 135 | } 136 | if (cost= lowLayerIndex; layerIndex--) {// 146 | if (costs[node_idx*MAX_LAYER+layerIndex + 1] < costs[node_idx*MAX_LAYER+layerIndex]) { 147 | costs[node_idx*MAX_LAYER+layerIndex] = 
costs[node_idx*MAX_LAYER+layerIndex + 1]; 148 | for (int childIndex = 0; childIndex < cur_child_num; childIndex++) 149 | { 150 | cur_best_path[childIndex * MAX_LAYER + layerIndex] = cur_best_path[childIndex * MAX_LAYER + layerIndex+1]; 151 | } 152 | } 153 | } 154 | } 155 | int node_x = nodes[node_idx] / Y % X, node_y = nodes[node_idx] % Y; 156 | 157 | for(int par_id = 0; par_id < parent_num_cur; par_id++) 158 | { 159 | int parent_IDX = par_nodes[par_num_sum[node_idx] + par_id]; 160 | int child_index_of_current_node = currentChildIDX[par_num_sum[node_idx] + par_id]%10; 161 | double *parent_childCosts = childCosts + child_num_sum[parent_IDX] * MAX_LAYER; 162 | int *parent_childCosts_road = childCosts_road + child_num_sum[parent_IDX] * MAX_LAYER; 163 | int px = nodes[parent_IDX] / Y % X, py = nodes[parent_IDX] % Y; 164 | assert(px==node_x||py==node_y); 165 | for(int layer = MIN_ROUTE_LAYER; layer=end_shift) 184 | { 185 | return; 186 | } 187 | int node_id = depth_node[node_sequence]; 188 | int net_id = node_net_idx[node_id]; 189 | int *net_routes = routes + pin_acc_num[net_id] * ROUTE_PER_PIN; 190 | int *cur_best_path = best_path + child_num_sum[node_id] * MAX_LAYER; 191 | 192 | int l = nodes[node_id] / Y / X; 193 | int cur_x = nodes[node_id] / Y % X, cur_y = nodes[node_id] % Y; 194 | if(par_num_sum[node_id+1]-par_num_sum[node_id]==0) 195 | { 196 | int min_layer = 0; 197 | double min_cost = costs[node_id * MAX_LAYER]; 198 | for(int layer = 1; layer < MAX_LAYER; layer++) 199 | { 200 | if(costs[node_id * MAX_LAYER + layer] < min_cost) 201 | { 202 | min_cost = costs[ node_id * MAX_LAYER + layer]; 203 | min_layer = layer; 204 | } 205 | } 206 | layer_output[node_id] = min_layer; 207 | net_routes[0] = 1; 208 | } else{ 209 | int par_layer = -1; 210 | int par_idx = -1; 211 | int par_sequence = -1; 212 | for(int par_id = 0; par_id< par_num_sum[node_id+1]-par_num_sum[node_id]; par_id++) 213 | { 214 | int par_node = par_nodes[par_num_sum[node_id]+par_id]; 215 | 
if(layer_output[par_node]>=0) 216 | { 217 | par_idx = par_node; 218 | par_sequence = par_id; 219 | par_layer = layer_output[par_node]; 220 | int child_index_of_current_node = currentChildIDX[par_num_sum[node_id]+par_sequence]%10; 221 | int *par_best_path = best_path + child_num_sum[par_idx] * MAX_LAYER; 222 | int path = par_best_path[child_index_of_current_node * MAX_LAYER + par_layer]; 223 | int child_idx = path / MAX_LAYER; 224 | if(child_idx == node_id) 225 | { 226 | layer_output[node_id] = path % MAX_LAYER; 227 | int px = nodes[par_idx] / Y % X, py = nodes[par_idx] % Y; 228 | assert(px==cur_x||py==cur_y); 229 | if(px==cur_x && cur_y!=py) 230 | { 231 | graph::atomic_add_unit_demand_wire_segment(layer_output[node_id] - 1, px, px, min(py,cur_y), max(py,cur_y), stamp); 232 | int idd1 = atomicAdd(net_routes,2); 233 | net_routes[idd1] = IDX(layer_output[node_id] - 1, px, min(py,cur_y)); 234 | net_routes[idd1+1] = IDX(layer_output[node_id] - 1, px, max(py,cur_y)); 235 | } 236 | else if(py==cur_y && cur_x != px) 237 | { 238 | graph::atomic_add_unit_demand_wire_segment(layer_output[node_id] - 1, min(px,cur_x), max(px,cur_x), py, py, stamp); 239 | int idd1 = atomicAdd(net_routes,2); 240 | net_routes[idd1] = IDX(layer_output[node_id] - 1, min(px,cur_x), py); 241 | net_routes[idd1+1] = IDX(layer_output[node_id] - 1, max(px,cur_x), py); 242 | } 243 | break; 244 | }else{ 245 | layer_output[node_id] = -1; 246 | } 247 | } 248 | } 249 | if(par_layer==-1) 250 | { 251 | layer_output[node_id] = -1; 252 | return; 253 | } 254 | int child_index_of_current_node = currentChildIDX[par_num_sum[node_id]+par_sequence]%10; 255 | int *par_best_path = best_path + child_num_sum[par_idx] * MAX_LAYER; 256 | int path = par_best_path[child_index_of_current_node * MAX_LAYER + par_layer]; 257 | int child_idx = path / MAX_LAYER; 258 | if( child_idx != node_id) 259 | { 260 | layer_output[node_id] = -1; 261 | return; 262 | } 263 | } 264 | int num_child = child_num_sum[node_id+1] - 
// Worker loop: claims the next unprocessed net via a shared atomic counter
// (simple work stealing) and generates detour candidates for it.
// NOTE(review): container/atomic element types were lost in extraction;
// restored as vector<int> / std::atomic<int> based on usage.
void process_net(int thread_id, vector<int> &nets2route, int thread_num, std::atomic<int>& currentNetId) {
    while (true) {
        int netId = currentNetId.fetch_add(1);
        if (netId >= (int)nets2route.size()) {
            break;
        }
        // std::move on these raw global pointers is a no-op cast; kept to
        // preserve the original call shape of generate_detours.
        nets[nets2route[netId]].generate_detours(move(congestionView_cpu), move(congestionView_xsum_cpu), move(congestionView_ysum_cpu), false);
    }
}

// Fans nets2route out across a fixed pool of 8 worker threads and blocks
// until every net has been processed.
void multithreaded_processing(vector<int> &nets2route) {
    std::vector<std::thread> threads;
    int max_threads = 8;
    threads.reserve(max_threads);
    std::atomic<int> currentNetId(0);
    for (int i = 0; i < max_threads; ++i) {
        threads.emplace_back(process_net, i, std::ref(nets2route), max_threads, std::ref(currentNetId));
    }
    for (auto& t : threads) {
        t.join();
    }
}

// Driver for DAG-based L-shape routing with detours:
//   1. snapshot device congestion maps to the host,
//   2. build RSMTs and detour candidates for every net (multithreaded),
//   3. batch the nets and allocate per-batch GPU/CPU scratch arrays,
//   4. route each batch (largest first), then release all scratch memory.
void Lshape_route_detour_wrap(vector<int> &nets2route)
{
    double DAG_start_time = elapsed_time();
    if (nets2route.size() == 0)
    {
        return;
    }
    // Route long nets first.
    sort(nets2route.begin(), nets2route.end(), [](int l, int r)
         { return nets[l].hpwl > nets[r].hpwl; });
    // FIX: element counts were multiplied by sizeof(T) ("new float[X*Y*sizeof(float)]"),
    // over-allocating each buffer by a factor of sizeof(T). new[] takes an
    // element count, not a byte count.
    congestionView_cpu = new bool[X * Y];
    congestionView_xsum_cpu = new float[X * Y];
    congestionView_ysum_cpu = new float[X * Y];
    cudaMemcpy(congestionView_cpu, congestion, X * Y * sizeof(bool), cudaMemcpyDeviceToHost);
    cudaMemcpy(congestionView_xsum_cpu, congestion_xsum, X * Y * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(congestionView_ysum_cpu, congestion_ysum, X * Y * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nets2route.size(); i++)
    {
        if (nets[nets2route[i]].rsmt.size() < 1)
        {
            nets[nets2route[i]].construct_rsmt();
        }
    }
    multithreaded_processing(nets2route);
    if(LOG) printf("[%5.1f] Generating batches Starts\n", elapsed_time());
    auto batches = generate_batches_rsmt(nets2route, 300000);
    if(LOG) printf("[%5.1f] Generating batches Ends\n", elapsed_time());

    // Size the scratch arrays by the largest batch.
    int net_cnt_estimate = 0;
    int node_num_max = 0;
    int par_num_max = 0;
    for (int ii = 0; ii < batches.size(); ii++)
    {
        int tmp = 0;
        int tmp2 = 0;
        if (batches[ii].size() > net_cnt_estimate)
        {
            net_cnt_estimate = batches[ii].size();
        }
        for (int j = 0; j < batches[ii].size(); j++)
        {
            tmp += nets[batches[ii][j]].node_index_cnt;
            tmp2 += nets[batches[ii][j]].par_num_sum_cpu[nets[batches[ii][j]].node_index_cnt];
        }
        if (tmp > node_num_max)
        {
            node_num_max = tmp;
        }
        if (tmp2 > par_num_max)
        {
            par_num_max = tmp2;
        }
    }

    net_cnt_estimate += 5;
    node_cnt_estimate = node_num_max + 10;
    parent_cnt_estimate = par_num_max + 10;
    if(LOG) printf("[%5.1f] Lshape_route Starts\n", elapsed_time());
    // NOTE(review): none of these CUDA calls are error-checked; consider a
    // CUDA_CHECK macro around allocation/copy calls.
    cudaMalloc(&node_cnt_sum, net_cnt_estimate * sizeof(int));
    cudaMalloc(&nodes, node_cnt_estimate * sizeof(int));
    cudaMalloc(&net_depth, net_cnt_estimate * sizeof(int));
    cudaMalloc(&batch_depth, (batches.size() + 1) * sizeof(int));
    cudaMalloc(&child_num_sum, node_cnt_estimate * sizeof(int));
    cudaMalloc(&par_num_sum, node_cnt_estimate * sizeof(int));
    cudaMalloc(&node_net_idx, node_cnt_estimate * sizeof(int));
    cudaMalloc(&node_net_idx2, node_cnt_estimate * sizeof(int));
    cudaMalloc(&node_depth, node_cnt_estimate * sizeof(int));
    cudaMalloc(&depth_node, node_cnt_estimate * sizeof(int));
    cudaMalloc(&layer_range, node_cnt_estimate * sizeof(int));
    cudaMalloc(&costs, node_cnt_estimate * MAX_LAYER * sizeof(double));
    cudaMalloc(&locks, parent_cnt_estimate * MAX_LAYER * sizeof(int));
    cudaMemset(locks, 0, sizeof(int) * parent_cnt_estimate * MAX_LAYER);
    cudaMalloc(&layer_output, node_cnt_estimate * sizeof(int));
    cudaMalloc(&par_nodes, parent_cnt_estimate * sizeof(int));
    cudaMalloc(&cost_edges, parent_cnt_estimate * 81 * sizeof(double));
    cudaMalloc(&best_change, parent_cnt_estimate * 81 * sizeof(int));
    cudaMalloc(&best_path, parent_cnt_estimate * MAX_LAYER * sizeof(int));
    cudaMalloc(&childCosts, parent_cnt_estimate * MAX_LAYER * sizeof(double));
    cudaMalloc(&childCosts_road, parent_cnt_estimate * MAX_LAYER * sizeof(int));
    cudaMalloc(&currentChildIDX, parent_cnt_estimate * sizeof(int));
    /////////////////////////////////// cpu arrays init starts ////////////////////////////////////////
    node_cnt_sum_cpu = new int[net_cnt_estimate]();
    int reserve_node_num = node_cnt_estimate;
    int biggest_depth = MAX_DEPTH;
    node_depth_cpu = new int[reserve_node_num]();
    net_depth_cpu = new int[net_cnt_estimate]();
    batch_depth_cnt_cpu = new int[batches.size() + 1]();
    depth_node_cnt_cpu = new int[biggest_depth * (batches.size() + 1)]();
    depth_node_cpu = new int[reserve_node_num]();
    nodes_cpu = new int[reserve_node_num]();
    node_net_idx_cpu = new int[reserve_node_num]();
    node_net_idx2_cpu = new int[reserve_node_num]();
    child_num_cpu = new int[reserve_node_num]();
    child_num_sum_cpu = new int[reserve_node_num]();
    par_num_cpu = new int[reserve_node_num]();
    par_num_sum_cpu = new int[reserve_node_num]();
    par_nodes_cpu = new int[parent_cnt_estimate]();
    currentChildIDX_cpu = new int[parent_cnt_estimate]();
    depthID2nodeSequence = new int[batches.size() * MAX_DEPTH];
    /////////////////////////////////// cpu arrays init ends ////////////////////////////////////////

    // Route batches from smallest to largest index group (reverse order).
    for (int ii = batches.size() - 1; ii >= 0; ii--)
    {
        Lshape_route_detour(batches[ii]);
    }

    cudaFree(node_cnt_sum);
    cudaFree(par_nodes);
    cudaFree(nodes);
    // cudaFree of pointers not allocated in this function ('from',
    // 'in_degree', 'fixed_layers', 'lock_gpu', 'depth_node_cnt') is harmless
    // while they remain null (namespace-scope pointers are zero-initialized);
    // cudaFree(nullptr) is a no-op.
    cudaFree(from);
    cudaFree(layer_range);
    cudaFree(in_degree);
    cudaFree(currentChildIDX);
    cudaFree(par_num_sum);
    cudaFree(locks);
    cudaFree(layer_output);
    cudaFree(fixed_layers);
    cudaFree(node_net_idx);
    cudaFree(node_net_idx2);
    cudaFree(lock_gpu);
    cudaFree(node_depth);
    cudaFree(net_depth);
    cudaFree(batch_depth);
    cudaFree(depth_node);
    cudaFree(depth_node_cnt);
    cudaFree(childCosts);
    cudaFree(childCosts_road);
    cudaFree(best_path);
    // FIX: these four device buffers are cudaMalloc'd above on every call but
    // were missing from the cleanup list — a per-call device memory leak.
    cudaFree(child_num_sum);
    cudaFree(costs);
    cudaFree(cost_edges);
    cudaFree(best_change);
    delete[] congestionView_cpu;
    // FIX: xsum/ysum host snapshots and depthID2nodeSequence were leaked on
    // every call.
    delete[] congestionView_xsum_cpu;
    delete[] congestionView_ysum_cpu;
    delete[] node_cnt_sum_cpu;
    delete[] node_depth_cpu;
    delete[] net_depth_cpu;
    delete[] batch_depth_cnt_cpu;
    delete[] depth_node_cnt_cpu;
    delete[] depth_node_cpu;
    delete[] nodes_cpu;
    delete[] node_net_idx_cpu;
    delete[] node_net_idx2_cpu;
    delete[] child_num_cpu;
    delete[] child_num_sum_cpu;
    delete[] par_num_cpu;
    delete[] par_num_sum_cpu;
    delete[] par_nodes_cpu;
    delete[] currentChildIDX_cpu;
    delete[] depthID2nodeSequence;
    if(LOG) printf("[%5.1f] Lshape_route Ends\n", elapsed_time());
    DAG_time = elapsed_time() - DAG_start_time;
}
////////////////////////////////////////////// cpu array memset starts ////////////////////////////////////////////////// 479 | memset(node_cnt_sum_cpu, 0, (net_cnt + 1) * sizeof(int)); 480 | int batch_reserve_node = node_cnt + 10; 481 | int reserve_node_num = min(node_cnt_estimate, batch_reserve_node);//to be optimized 482 | int reserve_par_num = min(parent_cnt_estimate, par_cnt+10); 483 | memset(net_depth_cpu, 0, net_cnt * sizeof(int)); 484 | memset(batch_depth_cnt_cpu, 0, (batches.size()+1) * sizeof(int)); 485 | memset(depth_node_cnt_cpu, 0, MAX_DEPTH*(batches.size()+1) * sizeof(int)); 486 | memset(child_num_sum_cpu, 0, reserve_node_num * sizeof(int)); 487 | memset(par_num_cpu, 0, reserve_node_num * sizeof(int)); 488 | memset(par_num_sum_cpu, 0, reserve_node_num * sizeof(int)); 489 | memset(depthID2nodeSequence, 0, batches.size()*MAX_DEPTH * sizeof(int)); 490 | ////////////////////////////////////////////// cpu array memset ends ////////////////////////////////////////////////// 491 | for(int b_id=0; b_id>>(node_total*MAX_LAYER); 578 | init_min_child_costs<<<(child_num_sum_cpu[node_total]+1) * MAX_LAYER, 1>>>((child_num_sum_cpu[node_total]+1) * MAX_LAYER); 579 | init_road<<<(child_num_sum_cpu[node_total]+1) * MAX_LAYER, 1>>>((child_num_sum_cpu[node_total]+1) * MAX_LAYER); 580 | 581 | for(int i = 0; i < batches.size(); i++) { 582 | int total_node_num = batch_cnt_sum[i+1] - batch_cnt_sum[i]; 583 | int net_offset = batch_cnt_sum[i]; 584 | int next_net_offset = batch_cnt_sum[i+1]; 585 | 586 | graph::commit_wire_demand<<>> (batches[i].size(), 0, ++global_timestamp, -1); 587 | graph::commit_via_demand<<>> (batches[i].size(), 0, global_timestamp, -1); 588 | global_timestamp++; 589 | graph::update_cost(); 590 | graph::compute_presum<<>> (); 591 | cudaDeviceSynchronize(); 592 | int cur_batch_depth = batch_depth_cnt_cpu[i+1] - batch_depth_cnt_cpu[i]; 593 | for(int d = cur_batch_depth - 1; d >= 0; d--) 594 | { 595 | int shift = 
depthID2nodeSequence[batch_depth_cnt_cpu[i]+d]; 596 | int end_shift = depthID2nodeSequence[batch_depth_cnt_cpu[i]+d+1]; 597 | Lshape_route_node_cuda<<>> (shift, end_shift); 598 | } 599 | for(int d = 0; d < cur_batch_depth; d++) 600 | { 601 | int shift = depthID2nodeSequence[batch_depth_cnt_cpu[i]+d]; 602 | int end_shift = depthID2nodeSequence[batch_depth_cnt_cpu[i]+d+1]; 603 | get_routing_tree_cuda<<>> (shift, end_shift, d, global_timestamp); 604 | cudaDeviceSynchronize(); 605 | } 606 | graph::batch_wire_update(global_timestamp); 607 | graph::commit_via_demand<<>> (batches[i].size(), 0, global_timestamp); 608 | } 609 | } 610 | 611 | } -------------------------------------------------------------------------------- /src/flute.hpp: -------------------------------------------------------------------------------- 1 | #ifndef FLUTE_HPP_ 2 | #define FLUTE_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace flute { 11 | 12 | #define DEGREE 9 // LUT will be used when d <= DEGREE, DEGREE <= 9 13 | #define FLUTEROUTING 1 // 1 to construct routing, 0 to estimate WL only 14 | #define REMOVE_DUPLICATE_PIN 0 // Remove dup. pin for flute_wl() & flute() 15 | #define ACCURACY 8 // Default accuracy is 3 16 | // #define MAXD 2008840 // max. degree of a net that can be handled 17 | #define MAXD 10000 // max. degree of a net that can be handled 18 | 19 | using DType = int; 20 | 21 | // TODO: 22 | // 1. replace macros with constexpr variables 23 | // 2. construct a param struct 24 | // 3. replace all malloc-free contents 25 | // 4. 
clear inconsistent comments 26 | 27 | struct Branch { 28 | DType x, y; // starting point of the branch 29 | int n; // index of neighbor 30 | }; 31 | 32 | struct Tree { 33 | int deg; // degree 34 | DType length; // total wirelength 35 | Branch* branch; // array of tree branches 36 | }; 37 | 38 | // Major functions 39 | void readLUT(const char* powv, const char* post); 40 | Tree flute(int d, DType* x, DType* y, int acc); 41 | DType wirelength(Tree t); 42 | void printtree(Tree t); 43 | 44 | // Other useful functions 45 | Tree flutes_LD(int d, DType* xs, DType* ys, int* s); 46 | Tree flutes_MD(int d, DType* xs, DType* ys, int* s, int acc); 47 | Tree flutes_RDP(int d, DType* xs, DType* ys, int* s, int acc); 48 | 49 | #if REMOVE_DUPLICATE_PIN == 1 50 | #define flutes(d, xs, ys, s, acc) flutes_RDP(d, xs, ys, s, acc) 51 | #else 52 | #define flutes(d, xs, ys, s, acc) flutes_ALLD(d, xs, ys, s, acc) 53 | #endif 54 | 55 | #define flutes_ALLD(d, xs, ys, s, acc) flutes_LMD(d, xs, ys, s, acc) 56 | 57 | #define flutes_LMD(d, xs, ys, s, acc) \ 58 | (d <= DEGREE ? flutes_LD(d, xs, ys, s) : flutes_MD(d, xs, ys, s, acc)) 59 | 60 | 61 | #if DEGREE <= 7 62 | #define MGROUP 5040 / 4 // Max. # of groups, 7! = 5040 63 | #define MPOWV 15 // Max. # of POWVs per group 64 | #elif DEGREE == 8 65 | #define MGROUP 40320 / 4 // Max. # of groups, 8! = 40320 66 | #define MPOWV 33 // Max. # of POWVs per group 67 | #elif DEGREE == 9 68 | #define MGROUP 362880 / 4 // Max. # of groups, 9! = 362880 69 | #define MPOWV 79 // Max. # of POWVs per group 70 | #endif 71 | int numgrp[10] = {0, 0, 0, 0, 6, 30, 180, 1260, 10080, 90720}; 72 | 73 | struct Point { 74 | DType x, y; 75 | int o; 76 | }; 77 | 78 | struct csoln { 79 | unsigned char parent; 80 | unsigned char seg[12]; // Add: 0..i, Sub: j..11; seg[i+1]=seg[j-1]=0 81 | unsigned char row[DEGREE - 2], col[DEGREE - 2]; 82 | unsigned char neighbor[2 * DEGREE - 2]; 83 | }; 84 | 85 | struct csoln* LUT[DEGREE + 1][MGROUP]; // storing 4 .. 
D 86 | int numsoln[DEGREE + 1][MGROUP]; 87 | 88 | Tree dmergetree(Tree t1, Tree t2); 89 | Tree hmergetree(Tree t1, Tree t2, int* s); 90 | Tree vmergetree(Tree t1, Tree t2); 91 | 92 | void readLUT(const char* powv, const char* post) { 93 | static bool LUTread = false; 94 | if (LUTread) return; 95 | LUTread = true; 96 | 97 | FILE *fpwv, *fprt; 98 | struct csoln* p; 99 | int d, i, j, k, kk, ns, nn, ne; 100 | unsigned char line[99], *linep, c; 101 | unsigned char charnum[256]; 102 | unsigned char divd[256], modd[256], div16[256], mod16[256], dsq; 103 | 104 | for (i = 0; i <= 255; i++) { 105 | if ('0' <= i && i <= '9') 106 | charnum[i] = i - '0'; 107 | else if (i >= 'A') 108 | charnum[i] = i - 'A' + 10; 109 | else // if (i=='$' || i=='\n' || ... ) 110 | charnum[i] = 0; 111 | 112 | div16[i] = i / 16; 113 | mod16[i] = i % 16; 114 | } 115 | 116 | fpwv = fopen(powv, "r"); 117 | if (fpwv == nullptr) { 118 | printf("Error in opening %s\n", powv); 119 | exit(1); 120 | } 121 | 122 | #if FLUTEROUTING == 1 123 | fprt = fopen(post, "r"); 124 | if (fprt == nullptr) { 125 | printf("Error in opening %s\n", post); 126 | exit(1); 127 | } 128 | #endif 129 | 130 | for (d = 4; d <= DEGREE; d++) { 131 | for (i = 0; i <= 255; i++) { 132 | divd[i] = i / d; 133 | modd[i] = i % d; 134 | } 135 | dsq = d * d; 136 | 137 | // Return value of fscanf/fread functions. 
138 | int ret = fscanf(fpwv, "d=%d\n", &d); 139 | #if FLUTEROUTING == 1 140 | ret = fscanf(fprt, "d=%d\n", &d); 141 | #endif 142 | for (k = 0; k < numgrp[d]; k++) { 143 | ns = (int)charnum[fgetc(fpwv)]; 144 | 145 | if (ns == 0) { // same as some previous group 146 | ret = fscanf(fpwv, "%d\n", &kk); 147 | numsoln[d][k] = numsoln[d][kk]; 148 | LUT[d][k] = LUT[d][kk]; 149 | } else { 150 | fgetc(fpwv); // '\n' 151 | numsoln[d][k] = ns; 152 | p = (struct csoln*)malloc(ns * sizeof(struct csoln)); 153 | LUT[d][k] = p; 154 | for (i = 1; i <= ns; i++) { 155 | linep = (unsigned char*)fgets((char*)line, 99, fpwv); 156 | p->parent = charnum[*(linep++)]; 157 | j = 0; 158 | while ((p->seg[j++] = charnum[*(linep++)]) != 0) 159 | ; 160 | j = 10; 161 | while ((p->seg[j--] = charnum[*(linep++)]) != 0) 162 | ; 163 | #if FLUTEROUTING == 1 164 | nn = 2 * d - 2; 165 | ne = 2 * d - 3; 166 | ret = fread(line, 1, d - 2, fprt); 167 | linep = line; 168 | for (j = d; j < nn; j++) { 169 | c = *(linep++); 170 | if (c >= dsq) { 171 | c -= dsq; 172 | p->neighbor[divd[c]] = j; 173 | ne--; 174 | } 175 | p->row[j - d] = divd[c]; 176 | p->col[j - d] = modd[c]; 177 | p->neighbor[j] = j; // initialized 178 | } 179 | ret = fread(line, 1, ne + 1, fprt); 180 | linep = line; // last char = \n 181 | for (j = 0; j < ne; j++) { 182 | c = *(linep++); 183 | p->neighbor[div16[c]] = mod16[c]; 184 | } 185 | #endif 186 | p++; 187 | } 188 | } 189 | } 190 | } 191 | } 192 | 193 | // For medium-degree, i.e., D+1 <= d 194 | 195 | Tree flute(int d, DType* x, DType* y, int acc) { 196 | auto allocateSize = std::max(MAXD, d + 1); 197 | DType* xs = (DType*)malloc(sizeof(DType) * allocateSize); 198 | DType* ys = (DType*)malloc(sizeof(DType) * allocateSize); 199 | int* s = (int*)malloc(sizeof(int) * allocateSize); 200 | Point* pt = (Point*)malloc(sizeof(Point) * allocateSize); 201 | Point** ptp = (Point**)malloc(sizeof(Point*) * allocateSize); 202 | 203 | Point* tmpp; 204 | DType minval; 205 | int i, j, minidx; 206 | Tree t; 
207 | 208 | if (d == 2) { 209 | t.deg = 2; 210 | t.length = std::abs(x[0] - x[1]) + std::abs(y[0] - y[1]); 211 | t.branch = (Branch*)malloc(2 * sizeof(Branch)); 212 | t.branch[0].x = x[0]; 213 | t.branch[0].y = y[0]; 214 | t.branch[0].n = 1; 215 | t.branch[1].x = x[1]; 216 | t.branch[1].y = y[1]; 217 | t.branch[1].n = 1; 218 | } else { 219 | for (i = 0; i < d; i++) { 220 | pt[i].x = x[i]; 221 | pt[i].y = y[i]; 222 | ptp[i] = &pt[i]; 223 | } 224 | 225 | // sort x 226 | for (i = 0; i < d - 1; i++) { 227 | minval = ptp[i]->x; 228 | minidx = i; 229 | for (j = i + 1; j < d; j++) { 230 | if (minval > ptp[j]->x || (minval == ptp[j]->x && ptp[minidx]->y < ptp[j]->y)) { 231 | minval = ptp[j]->x; 232 | minidx = j; 233 | } 234 | } 235 | tmpp = ptp[i]; 236 | ptp[i] = ptp[minidx]; 237 | ptp[minidx] = tmpp; 238 | } 239 | 240 | #if REMOVE_DUPLICATE_PIN == 1 241 | ptp[d] = &pt[d]; 242 | ptp[d]->x = ptp[d]->y = -999999; 243 | j = 0; 244 | for (i = 0; i < d; i++) { 245 | for (k = i + 1; ptp[k]->x == ptp[i]->x; k++) 246 | if (ptp[k]->y == ptp[i]->y) // pins k and i are the same 247 | break; 248 | if (ptp[k]->x != ptp[i]->x) 249 | ptp[j++] = ptp[i]; 250 | } 251 | d = j; 252 | #endif 253 | 254 | for (i = 0; i < d; i++) { 255 | xs[i] = ptp[i]->x; 256 | ptp[i]->o = i; 257 | } 258 | 259 | // sort y to find s[] 260 | for (i = 0; i < d - 1; i++) { 261 | minval = ptp[i]->y; 262 | minidx = i; 263 | for (j = i + 1; j < d; j++) { 264 | if (minval > ptp[j]->y) { 265 | minval = ptp[j]->y; 266 | minidx = j; 267 | } 268 | } 269 | ys[i] = ptp[minidx]->y; 270 | s[i] = ptp[minidx]->o; 271 | ptp[minidx] = ptp[i]; 272 | } 273 | ys[d - 1] = ptp[d - 1]->y; 274 | s[d - 1] = ptp[d - 1]->o; 275 | 276 | t = flutes(d, xs, ys, s, acc); 277 | } 278 | free(xs); 279 | free(ys); 280 | free(s); 281 | free(pt); 282 | free(ptp); 283 | return t; 284 | } 285 | 286 | // xs[] and ys[] are coords in x and y in sorted order 287 | // s[] is a list of nodes in increasing y direction 288 | // if nodes are indexed in the order 
of increasing x coord 289 | // i.e., s[i] = s_i in defined in paper 290 | // The Points are (xs[s[i]], ys[i]) for i=0..d-1 291 | // or (xs[i], ys[si[i]]) for i=0..d-1 292 | 293 | Tree flutes_RDP(int d, DType* xs, DType* ys, int* s, int acc) { 294 | int i, j, ss; 295 | 296 | for (i = 0; i < d - 1; i++) { 297 | if (xs[s[i]] == xs[s[i + 1]] && ys[i] == ys[i + 1]) { 298 | if (s[i] < s[i + 1]) 299 | ss = s[i + 1]; 300 | else { 301 | ss = s[i]; 302 | s[i] = s[i + 1]; 303 | } 304 | for (j = i + 2; j < d; j++) { 305 | ys[j - 1] = ys[j]; 306 | s[j - 1] = s[j]; 307 | } 308 | for (j = ss + 1; j < d; j++) 309 | xs[j - 1] = xs[j]; 310 | for (j = 0; j <= d - 2; j++) 311 | if (s[j] > ss) s[j]--; 312 | i--; 313 | d--; 314 | } 315 | } 316 | return flutes_ALLD(d, xs, ys, s, acc); 317 | } 318 | 319 | // For low-degree, i.e., 2 <= d <= D 320 | Tree flutes_LD(int d, DType* xs, DType* ys, int* s) { 321 | int k, pi, i, j; 322 | struct csoln *rlist, *bestrlist; 323 | DType dd[2 * DEGREE - 2]; // 0..D-2 for v, D-1..2*D-3 for h 324 | DType minl, sum; 325 | DType l[MPOWV + 1]; 326 | int hflip; 327 | Tree t; 328 | 329 | t.deg = d; 330 | t.branch = (Branch*)malloc((2 * d - 2) * sizeof(Branch)); 331 | if (d == 2) { 332 | minl = xs[1] - xs[0] + ys[1] - ys[0]; 333 | t.branch[0].x = xs[s[0]]; 334 | t.branch[0].y = ys[0]; 335 | t.branch[0].n = 1; 336 | t.branch[1].x = xs[s[1]]; 337 | t.branch[1].y = ys[1]; 338 | t.branch[1].n = 1; 339 | } else if (d == 3) { 340 | minl = xs[2] - xs[0] + ys[2] - ys[0]; 341 | t.branch[0].x = xs[s[0]]; 342 | t.branch[0].y = ys[0]; 343 | t.branch[0].n = 3; 344 | t.branch[1].x = xs[s[1]]; 345 | t.branch[1].y = ys[1]; 346 | t.branch[1].n = 3; 347 | t.branch[2].x = xs[s[2]]; 348 | t.branch[2].y = ys[2]; 349 | t.branch[2].n = 3; 350 | t.branch[3].x = xs[1]; 351 | t.branch[3].y = ys[1]; 352 | t.branch[3].n = 3; 353 | } else { 354 | k = 0; 355 | if (s[0] < s[2]) k++; 356 | if (s[1] < s[2]) k++; 357 | 358 | for (i = 3; i <= d - 1; i++) { // p0=0 always, skip i=1 for symmetry 
359 | pi = s[i]; 360 | for (j = d - 1; j > i; j--) 361 | if (s[j] < s[i]) 362 | pi--; 363 | k = pi + (i + 1) * k; 364 | } 365 | 366 | if (k < numgrp[d]) { // no horizontal flip 367 | hflip = 0; 368 | for (i = 1; i <= d - 3; i++) { 369 | dd[i] = ys[i + 1] - ys[i]; 370 | dd[d - 1 + i] = xs[i + 1] - xs[i]; 371 | } 372 | } else { 373 | hflip = 1; 374 | k = 2 * numgrp[d] - 1 - k; 375 | for (i = 1; i <= d - 3; i++) { 376 | dd[i] = ys[i + 1] - ys[i]; 377 | dd[d - 1 + i] = xs[d - 1 - i] - xs[d - 2 - i]; 378 | } 379 | } 380 | 381 | minl = l[0] = xs[d - 1] - xs[0] + ys[d - 1] - ys[0]; 382 | rlist = LUT[d][k]; 383 | for (i = 0; rlist->seg[i] > 0; i++) 384 | minl += dd[rlist->seg[i]]; 385 | bestrlist = rlist; 386 | l[1] = minl; 387 | j = 2; 388 | while (j <= numsoln[d][k]) { 389 | rlist++; 390 | sum = l[rlist->parent]; 391 | for (i = 0; rlist->seg[i] > 0; i++) 392 | sum += dd[rlist->seg[i]]; 393 | for (i = 10; rlist->seg[i] > 0; i--) 394 | sum -= dd[rlist->seg[i]]; 395 | if (sum < minl) { 396 | minl = sum; 397 | bestrlist = rlist; 398 | } 399 | l[j++] = sum; 400 | } 401 | 402 | t.branch[0].x = xs[s[0]]; 403 | t.branch[0].y = ys[0]; 404 | t.branch[1].x = xs[s[1]]; 405 | t.branch[1].y = ys[1]; 406 | for (i = 2; i < d - 2; i++) { 407 | t.branch[i].x = xs[s[i]]; 408 | t.branch[i].y = ys[i]; 409 | t.branch[i].n = bestrlist->neighbor[i]; 410 | } 411 | t.branch[d - 2].x = xs[s[d - 2]]; 412 | t.branch[d - 2].y = ys[d - 2]; 413 | t.branch[d - 1].x = xs[s[d - 1]]; 414 | t.branch[d - 1].y = ys[d - 1]; 415 | if (hflip) { 416 | if (s[1] < s[0]) { 417 | t.branch[0].n = bestrlist->neighbor[1]; 418 | t.branch[1].n = bestrlist->neighbor[0]; 419 | } else { 420 | t.branch[0].n = bestrlist->neighbor[0]; 421 | t.branch[1].n = bestrlist->neighbor[1]; 422 | } 423 | if (s[d - 1] < s[d - 2]) { 424 | t.branch[d - 2].n = bestrlist->neighbor[d - 1]; 425 | t.branch[d - 1].n = bestrlist->neighbor[d - 2]; 426 | } else { 427 | t.branch[d - 2].n = bestrlist->neighbor[d - 2]; 428 | t.branch[d - 1].n = 
bestrlist->neighbor[d - 1]; 429 | } 430 | for (i = d; i < 2 * d - 2; i++) { 431 | t.branch[i].x = xs[d - 1 - bestrlist->col[i - d]]; 432 | t.branch[i].y = ys[bestrlist->row[i - d]]; 433 | t.branch[i].n = bestrlist->neighbor[i]; 434 | } 435 | } else { // !hflip 436 | if (s[0] < s[1]) { 437 | t.branch[0].n = bestrlist->neighbor[1]; 438 | t.branch[1].n = bestrlist->neighbor[0]; 439 | } else { 440 | t.branch[0].n = bestrlist->neighbor[0]; 441 | t.branch[1].n = bestrlist->neighbor[1]; 442 | } 443 | if (s[d - 2] < s[d - 1]) { 444 | t.branch[d - 2].n = bestrlist->neighbor[d - 1]; 445 | t.branch[d - 1].n = bestrlist->neighbor[d - 2]; 446 | } else { 447 | t.branch[d - 2].n = bestrlist->neighbor[d - 2]; 448 | t.branch[d - 1].n = bestrlist->neighbor[d - 1]; 449 | } 450 | for (i = d; i < 2 * d - 2; i++) { 451 | t.branch[i].x = xs[bestrlist->col[i - d]]; 452 | t.branch[i].y = ys[bestrlist->row[i - d]]; 453 | t.branch[i].n = bestrlist->neighbor[i]; 454 | } 455 | } 456 | } 457 | t.length = minl; 458 | 459 | return t; 460 | } 461 | 462 | // For medium-degree, i.e., D+1 <= d <= D2 463 | Tree flutes_MD(int d, DType* xs, DType* ys, int* s, int acc) { 464 | auto allocateSize = std::max(MAXD, d + 1); 465 | DType* x1 = (DType*)malloc(sizeof(DType) * allocateSize); 466 | DType* x2 = (DType*)malloc(sizeof(DType) * allocateSize); 467 | DType* y1 = (DType*)malloc(sizeof(DType) * allocateSize); 468 | DType* y2 = (DType*)malloc(sizeof(DType) * allocateSize); 469 | int* si = (int*)malloc(sizeof(int) * allocateSize); 470 | int* s1 = (int*)malloc(sizeof(int) * allocateSize); 471 | int* s2 = (int*)malloc(sizeof(int) * allocateSize); 472 | float* score = (float*)malloc(sizeof(float) * 2 * allocateSize); 473 | float* penalty = (float*)malloc(sizeof(float) * allocateSize); 474 | DType* distx = (DType*)malloc(sizeof(DType) * allocateSize); 475 | DType* disty = (DType*)malloc(sizeof(DType) * allocateSize); 476 | 477 | // DType x1[MAXD], x2[MAXD], y1[MAXD], y2[MAXD]; 478 | // int si[MAXD], s1[MAXD], 
s2[MAXD]; 479 | // float score[2*MAXD], penalty[MAXD]; 480 | // DType distx[MAXD], disty[MAXD]; 481 | float pnlty, dx, dy; 482 | DType ll, minl, coord1, coord2; 483 | int i, r, p, maxbp, bestbp, bp, nbp, ub, lb, n1, n2, nn1, nn2, ms, newacc; 484 | Tree t, t1, t2, bestt1, bestt2; 485 | int mins, maxs, minsi, maxsi; 486 | DType xydiff; 487 | 488 | for (i = 0; i < allocateSize; ++i) { 489 | distx[i] = disty[i] = 0; 490 | } 491 | 492 | if (s[0] < s[d - 1]) { 493 | ms = std::max(s[0], s[1]); 494 | for (i = 2; i <= ms; i++) 495 | ms = std::max(ms, s[i]); 496 | if (ms <= d - 3) { 497 | for (i = 0; i <= ms; i++) { 498 | x1[i] = xs[i]; 499 | y1[i] = ys[i]; 500 | s1[i] = s[i]; 501 | } 502 | x1[ms + 1] = xs[ms]; 503 | y1[ms + 1] = ys[ms]; 504 | s1[ms + 1] = ms + 1; 505 | 506 | s2[0] = 0; 507 | for (i = 1; i <= d - 1 - ms; i++) 508 | s2[i] = s[i + ms] - ms; 509 | 510 | t1 = flutes_LMD(ms + 2, x1, y1, s1, acc); 511 | t2 = flutes_LMD(d - ms, xs + ms, ys + ms, s2, acc); 512 | t = dmergetree(t1, t2); 513 | free(t1.branch); 514 | free(t2.branch); 515 | 516 | free(x1); 517 | free(x2); 518 | free(y1); 519 | free(y2); 520 | free(si); 521 | free(s1); 522 | free(s2); 523 | free(score); 524 | free(penalty); 525 | free(distx); 526 | free(disty); 527 | return t; 528 | } 529 | } else { // (s[0] > s[d-1]) 530 | ms = std::min(s[0], s[1]); 531 | for (i = 2; i <= d - 1 - ms; i++) 532 | ms = std::min(ms, s[i]); 533 | if (ms >= 2) { 534 | x1[0] = xs[ms]; 535 | y1[0] = ys[0]; 536 | s1[0] = s[0] - ms + 1; 537 | for (i = 1; i <= d - 1 - ms; i++) { 538 | x1[i] = xs[i + ms - 1]; 539 | y1[i] = ys[i]; 540 | s1[i] = s[i] - ms + 1; 541 | } 542 | x1[d - ms] = xs[d - 1]; 543 | y1[d - ms] = ys[d - 1 - ms]; 544 | s1[d - ms] = 0; 545 | 546 | s2[0] = ms; 547 | for (i = 1; i <= ms; i++) 548 | s2[i] = s[i + d - 1 - ms]; 549 | 550 | t1 = flutes_LMD(d + 1 - ms, x1, y1, s1, acc); 551 | t2 = flutes_LMD(ms + 1, xs, ys + d - 1 - ms, s2, acc); 552 | t = dmergetree(t1, t2); 553 | free(t1.branch); 554 | free(t2.branch); 
555 | 556 | free(x1); 557 | free(x2); 558 | free(y1); 559 | free(y2); 560 | free(si); 561 | free(s1); 562 | free(s2); 563 | free(score); 564 | free(penalty); 565 | free(distx); 566 | free(disty); 567 | return t; 568 | } 569 | } 570 | 571 | // Find inverse si[] of s[] 572 | for (r = 0; r < d; r++) 573 | si[s[r]] = r; 574 | 575 | // Determine breaking directions and positions dp[] 576 | lb = std::max((d - acc) / 5, 2); 577 | ub = d - 1 - lb; 578 | 579 | // Compute scores 580 | #define AA 0.6 // 2.0*BB 581 | #define BB 0.3 582 | // #define CCC std::max(0.425-0.005*d-0.015*acc, 0.1) 583 | // #define CCC std::max(0.43-0.005*d-0.01*acc, 0.1) 584 | #define CCC std::max(0.41 - 0.005 * d, 0.1) 585 | #define DD 4.8 586 | float DDD = DD / (d - 1); 587 | 588 | // Compute penalty[] 589 | dx = CCC * (xs[d - 2] - xs[1]) / (d - 3); 590 | dy = CCC * (ys[d - 2] - ys[1]) / (d - 3); 591 | for (r = d / 2, pnlty = 0; r >= 2; r--, pnlty += dx) 592 | penalty[r] = pnlty, penalty[d - 1 - r] = pnlty; 593 | penalty[1] = pnlty, penalty[d - 2] = pnlty; 594 | penalty[0] = pnlty, penalty[d - 1] = pnlty; 595 | for (r = d / 2 - 1, pnlty = dy; r >= 2; r--, pnlty += dy) 596 | penalty[s[r]] += pnlty, penalty[s[d - 1 - r]] += pnlty; 597 | penalty[s[1]] += pnlty, penalty[s[d - 2]] += pnlty; 598 | penalty[s[0]] += pnlty, penalty[s[d - 1]] += pnlty; 599 | // #define CC 0.16 600 | // #define v(r) ((r==0||r==1||r==d-2||r==d-1) ? 
d-3 : abs(d-1-r-r)) 601 | // for (r=0; r maxs) 618 | maxs = s[r]; 619 | distx[r] = xs[maxs] - xs[mins]; 620 | if (si[r] < minsi) 621 | minsi = si[r]; 622 | else if (si[r] > maxsi) 623 | maxsi = si[r]; 624 | disty[r] = ys[maxsi] - ys[minsi] + xydiff; 625 | } 626 | 627 | if (s[d - 2] < s[d - 1]) 628 | mins = s[d - 2], maxs = s[d - 1]; 629 | else 630 | mins = s[d - 1], maxs = s[d - 2]; 631 | if (si[d - 2] < si[d - 1]) 632 | minsi = si[d - 2], maxsi = si[d - 1]; 633 | else 634 | minsi = si[d - 1], maxsi = si[d - 2]; 635 | for (r = d - 3; r >= lb; r--) { 636 | if (s[r] < mins) 637 | mins = s[r]; 638 | else if (s[r] > maxs) 639 | maxs = s[r]; 640 | distx[r] += xs[maxs] - xs[mins]; 641 | if (si[r] < minsi) 642 | minsi = si[r]; 643 | else if (si[r] > maxsi) 644 | maxsi = si[r]; 645 | disty[r] += ys[maxsi] - ys[minsi]; 646 | } 647 | 648 | nbp = 0; 649 | for (r = lb; r <= ub; r++) { 650 | if (si[r] <= 1) 651 | score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] - AA * (ys[2] - ys[1]) - DDD * disty[r]; 652 | else if (si[r] >= d - 2) 653 | score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] - AA * (ys[d - 2] - ys[d - 3]) - DDD * disty[r]; 654 | else 655 | score[nbp] = (xs[r + 1] - xs[r - 1]) - penalty[r] - BB * (ys[si[r] + 1] - ys[si[r] - 1]) - DDD * disty[r]; 656 | nbp++; 657 | 658 | if (s[r] <= 1) 659 | score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] - AA * (xs[2] - xs[1]) - DDD * distx[r]; 660 | else if (s[r] >= d - 2) 661 | score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] - AA * (xs[d - 2] - xs[d - 3]) - DDD * distx[r]; 662 | else 663 | score[nbp] = (ys[r + 1] - ys[r - 1]) - penalty[s[r]] - BB * (xs[s[r] + 1] - xs[s[r] - 1]) - DDD * distx[r]; 664 | nbp++; 665 | } 666 | 667 | if (acc <= 3) 668 | newacc = 1; 669 | else { 670 | newacc = acc / 2; 671 | if (acc >= nbp) acc = nbp - 1; 672 | } 673 | 674 | minl = (DType)INT_MAX; 675 | bestt1.branch = bestt2.branch = nullptr; 676 | for (i = 0; i < acc; i++) { 677 | maxbp = 0; 678 | for (bp = 1; bp < nbp; bp++) 679 | if 
(score[maxbp] < score[bp]) maxbp = bp; 680 | score[maxbp] = -9e9f; 681 | 682 | #define BreakPt(bp) ((bp) / 2 + lb) 683 | #define BreakInX(bp) ((bp) % 2 == 0) 684 | p = BreakPt(maxbp); 685 | // Breaking in p 686 | if (BreakInX(maxbp)) { // break in x 687 | n1 = n2 = 0; 688 | for (r = 0; r < d; r++) { 689 | if (s[r] < p) { 690 | s1[n1] = s[r]; 691 | y1[n1] = ys[r]; 692 | n1++; 693 | } else if (s[r] > p) { 694 | s2[n2] = s[r] - p; 695 | y2[n2] = ys[r]; 696 | n2++; 697 | } else { // if (s[r] == p) i.e., r = si[p] 698 | s1[n1] = p; 699 | s2[n2] = 0; 700 | y1[n1] = y2[n2] = ys[r]; 701 | nn1 = n1; 702 | nn2 = n2; 703 | n1++; 704 | n2++; 705 | } 706 | } 707 | 708 | t1 = flutes_LMD(p + 1, xs, y1, s1, newacc); 709 | t2 = flutes_LMD(d - p, xs + p, y2, s2, newacc); 710 | ll = t1.length + t2.length; 711 | coord1 = t1.branch[t1.branch[nn1].n].y; 712 | coord2 = t2.branch[t2.branch[nn2].n].y; 713 | if (t2.branch[nn2].y > std::max(coord1, coord2)) 714 | ll -= t2.branch[nn2].y - std::max(coord1, coord2); 715 | else if (t2.branch[nn2].y < std::min(coord1, coord2)) 716 | ll -= std::min(coord1, coord2) - t2.branch[nn2].y; 717 | } else { // if (!BreakInX(maxbp)) 718 | n1 = n2 = 0; 719 | for (r = 0; r < d; r++) { 720 | if (si[r] < p) { 721 | s1[si[r]] = n1; 722 | x1[n1] = xs[r]; 723 | n1++; 724 | } else if (si[r] > p) { 725 | s2[si[r] - p] = n2; 726 | x2[n2] = xs[r]; 727 | n2++; 728 | } else { // if (si[r] == p) i.e., r = s[p] 729 | s1[p] = n1; 730 | s2[0] = n2; 731 | x1[n1] = x2[n2] = xs[r]; 732 | n1++; 733 | n2++; 734 | } 735 | } 736 | 737 | t1 = flutes_LMD(p + 1, x1, ys, s1, newacc); 738 | t2 = flutes_LMD(d - p, x2, ys + p, s2, newacc); 739 | ll = t1.length + t2.length; 740 | coord1 = t1.branch[t1.branch[p].n].x; 741 | coord2 = t2.branch[t2.branch[0].n].x; 742 | if (t2.branch[0].x > std::max(coord1, coord2)) 743 | ll -= t2.branch[0].x - std::max(coord1, coord2); 744 | else if (t2.branch[0].x < std::min(coord1, coord2)) 745 | ll -= std::min(coord1, coord2) - t2.branch[0].x; 746 | } 747 
| if (minl > ll) { 748 | minl = ll; 749 | free(bestt1.branch); 750 | free(bestt2.branch); 751 | bestt1 = t1; 752 | bestt2 = t2; 753 | bestbp = maxbp; 754 | } else { 755 | free(t1.branch); 756 | free(t2.branch); 757 | } 758 | } 759 | 760 | if (BreakInX(bestbp)) 761 | t = hmergetree(bestt1, bestt2, s); 762 | else 763 | t = vmergetree(bestt1, bestt2); 764 | free(bestt1.branch); 765 | free(bestt2.branch); 766 | 767 | free(x1); 768 | free(x2); 769 | free(y1); 770 | free(y2); 771 | free(si); 772 | free(s1); 773 | free(s2); 774 | free(score); 775 | free(penalty); 776 | free(distx); 777 | free(disty); 778 | return t; 779 | } 780 | 781 | Tree dmergetree(Tree t1, Tree t2) { 782 | int i, d, prev, curr, next, offset1, offset2; 783 | Tree t; 784 | 785 | t.deg = d = t1.deg + t2.deg - 2; 786 | t.length = t1.length + t2.length; 787 | t.branch = (Branch*)malloc((2 * d - 2) * sizeof(Branch)); 788 | offset1 = t2.deg - 2; 789 | offset2 = 2 * t1.deg - 4; 790 | 791 | for (i = 0; i <= t1.deg - 2; i++) { 792 | t.branch[i].x = t1.branch[i].x; 793 | t.branch[i].y = t1.branch[i].y; 794 | t.branch[i].n = t1.branch[i].n + offset1; 795 | } 796 | for (i = t1.deg - 1; i <= d - 1; i++) { 797 | t.branch[i].x = t2.branch[i - t1.deg + 2].x; 798 | t.branch[i].y = t2.branch[i - t1.deg + 2].y; 799 | t.branch[i].n = t2.branch[i - t1.deg + 2].n + offset2; 800 | } 801 | for (i = d; i <= d + t1.deg - 3; i++) { 802 | t.branch[i].x = t1.branch[i - offset1].x; 803 | t.branch[i].y = t1.branch[i - offset1].y; 804 | t.branch[i].n = t1.branch[i - offset1].n + offset1; 805 | } 806 | for (i = d + t1.deg - 2; i <= 2 * d - 3; i++) { 807 | t.branch[i].x = t2.branch[i - offset2].x; 808 | t.branch[i].y = t2.branch[i - offset2].y; 809 | t.branch[i].n = t2.branch[i - offset2].n + offset2; 810 | } 811 | 812 | prev = t2.branch[0].n + offset2; 813 | curr = t1.branch[t1.deg - 1].n + offset1; 814 | next = t.branch[curr].n; 815 | while (curr != next) { 816 | t.branch[curr].n = prev; 817 | prev = curr; 818 | curr = next; 819 | 
next = t.branch[curr].n; 820 | } 821 | t.branch[curr].n = prev; 822 | 823 | return t; 824 | } 825 | 826 | Tree hmergetree(Tree t1, Tree t2, int* s) { 827 | int i, prev, curr, next, extra, offset1, offset2; 828 | int p, ii, n1, n2, nn1, nn2; 829 | DType coord1, coord2; 830 | Tree t; 831 | 832 | t.deg = t1.deg + t2.deg - 1; 833 | t.length = t1.length + t2.length; 834 | t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch)); 835 | offset1 = t2.deg - 1; 836 | offset2 = 2 * t1.deg - 3; 837 | 838 | p = t1.deg - 1; 839 | n1 = n2 = 0; 840 | for (i = 0; i < t.deg; i++) { 841 | if (s[i] < p) { 842 | t.branch[i].x = t1.branch[n1].x; 843 | t.branch[i].y = t1.branch[n1].y; 844 | t.branch[i].n = t1.branch[n1].n + offset1; 845 | n1++; 846 | } else if (s[i] > p) { 847 | t.branch[i].x = t2.branch[n2].x; 848 | t.branch[i].y = t2.branch[n2].y; 849 | t.branch[i].n = t2.branch[n2].n + offset2; 850 | n2++; 851 | } else { 852 | t.branch[i].x = t2.branch[n2].x; 853 | t.branch[i].y = t2.branch[n2].y; 854 | t.branch[i].n = t2.branch[n2].n + offset2; 855 | nn1 = n1; 856 | nn2 = n2; 857 | ii = i; 858 | n1++; 859 | n2++; 860 | } 861 | } 862 | for (i = t.deg; i <= t.deg + t1.deg - 3; i++) { 863 | t.branch[i].x = t1.branch[i - offset1].x; 864 | t.branch[i].y = t1.branch[i - offset1].y; 865 | t.branch[i].n = t1.branch[i - offset1].n + offset1; 866 | } 867 | for (i = t.deg + t1.deg - 2; i <= 2 * t.deg - 4; i++) { 868 | t.branch[i].x = t2.branch[i - offset2].x; 869 | t.branch[i].y = t2.branch[i - offset2].y; 870 | t.branch[i].n = t2.branch[i - offset2].n + offset2; 871 | } 872 | extra = 2 * t.deg - 3; 873 | coord1 = t1.branch[t1.branch[nn1].n].y; 874 | coord2 = t2.branch[t2.branch[nn2].n].y; 875 | if (t2.branch[nn2].y > std::max(coord1, coord2)) { 876 | t.branch[extra].y = std::max(coord1, coord2); 877 | t.length -= t2.branch[nn2].y - t.branch[extra].y; 878 | } else if (t2.branch[nn2].y < std::min(coord1, coord2)) { 879 | t.branch[extra].y = std::min(coord1, coord2); 880 | t.length -= 
t.branch[extra].y - t2.branch[nn2].y; 881 | } else 882 | t.branch[extra].y = t2.branch[nn2].y; 883 | t.branch[extra].x = t2.branch[nn2].x; 884 | t.branch[extra].n = t.branch[ii].n; 885 | t.branch[ii].n = extra; 886 | 887 | prev = extra; 888 | curr = t1.branch[nn1].n + offset1; 889 | next = t.branch[curr].n; 890 | while (curr != next) { 891 | t.branch[curr].n = prev; 892 | prev = curr; 893 | curr = next; 894 | next = t.branch[curr].n; 895 | } 896 | t.branch[curr].n = prev; 897 | 898 | return t; 899 | } 900 | 901 | Tree vmergetree(Tree t1, Tree t2) { 902 | int i, prev, curr, next, extra, offset1, offset2; 903 | DType coord1, coord2; 904 | Tree t; 905 | 906 | t.deg = t1.deg + t2.deg - 1; 907 | t.length = t1.length + t2.length; 908 | t.branch = (Branch*)malloc((2 * t.deg - 2) * sizeof(Branch)); 909 | offset1 = t2.deg - 1; 910 | offset2 = 2 * t1.deg - 3; 911 | 912 | for (i = 0; i <= t1.deg - 2; i++) { 913 | t.branch[i].x = t1.branch[i].x; 914 | t.branch[i].y = t1.branch[i].y; 915 | t.branch[i].n = t1.branch[i].n + offset1; 916 | } 917 | for (i = t1.deg - 1; i <= t.deg - 1; i++) { 918 | t.branch[i].x = t2.branch[i - t1.deg + 1].x; 919 | t.branch[i].y = t2.branch[i - t1.deg + 1].y; 920 | t.branch[i].n = t2.branch[i - t1.deg + 1].n + offset2; 921 | } 922 | for (i = t.deg; i <= t.deg + t1.deg - 3; i++) { 923 | t.branch[i].x = t1.branch[i - offset1].x; 924 | t.branch[i].y = t1.branch[i - offset1].y; 925 | t.branch[i].n = t1.branch[i - offset1].n + offset1; 926 | } 927 | for (i = t.deg + t1.deg - 2; i <= 2 * t.deg - 4; i++) { 928 | t.branch[i].x = t2.branch[i - offset2].x; 929 | t.branch[i].y = t2.branch[i - offset2].y; 930 | t.branch[i].n = t2.branch[i - offset2].n + offset2; 931 | } 932 | extra = 2 * t.deg - 3; 933 | coord1 = t1.branch[t1.branch[t1.deg - 1].n].x; 934 | coord2 = t2.branch[t2.branch[0].n].x; 935 | if (t2.branch[0].x > std::max(coord1, coord2)) { 936 | t.branch[extra].x = std::max(coord1, coord2); 937 | t.length -= t2.branch[0].x - t.branch[extra].x; 938 | } 
else if (t2.branch[0].x < std::min(coord1, coord2)) { 939 | t.branch[extra].x = std::min(coord1, coord2); 940 | t.length -= t.branch[extra].x - t2.branch[0].x; 941 | } else 942 | t.branch[extra].x = t2.branch[0].x; 943 | t.branch[extra].y = t2.branch[0].y; 944 | t.branch[extra].n = t.branch[t1.deg - 1].n; 945 | t.branch[t1.deg - 1].n = extra; 946 | 947 | prev = extra; 948 | curr = t1.branch[t1.deg - 1].n + offset1; 949 | next = t.branch[curr].n; 950 | while (curr != next) { 951 | t.branch[curr].n = prev; 952 | prev = curr; 953 | curr = next; 954 | next = t.branch[curr].n; 955 | } 956 | t.branch[curr].n = prev; 957 | 958 | return t; 959 | } 960 | 961 | DType wirelength(Tree t) { 962 | int i, j; 963 | DType l = 0; 964 | 965 | for (i = 0; i < 2 * t.deg - 2; i++) { 966 | j = t.branch[i].n; 967 | l += std::abs(t.branch[i].x - t.branch[j].x) + 968 | std::abs(t.branch[i].y - t.branch[j].y); 969 | } 970 | 971 | return l; 972 | } 973 | 974 | void printtree(Tree t) { 975 | int i; 976 | 977 | for (i = 0; i < t.deg; i++) 978 | printf(" %-2d: x=%4g y=%4g e=%d\n", 979 | i, (float)t.branch[i].x, (float)t.branch[i].y, t.branch[i].n); 980 | for (i = t.deg; i < 2 * t.deg - 2; i++) 981 | printf("s%-2d: x=%4g y=%4g e=%d\n", 982 | i, (float)t.branch[i].x, (float)t.branch[i].y, t.branch[i].n); 983 | printf("\n"); 984 | } 985 | 986 | } // namespace flute 987 | 988 | #endif // FLUTE_HPP_ -------------------------------------------------------------------------------- /src/database_cuda.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "database.hpp" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace cudb { 11 | 12 | __managed__ int *rip_up_list; 13 | __managed__ bool *congestion; 14 | __managed__ float *congestion_xsum; 15 | __managed__ float *congestion_ysum; 16 | std::mutex mtx; 17 | 18 | __managed__ unsigned long long ROUTE_PER_PIN = 27; 19 | 20 | __managed__ int DIR, L, X, 
Y, XY, NET_NUM, PIN_NUM, *x_edge_len, *y_edge_len, *pin_acc_num, *pins;//DIR: direction of layer 0 in cuda database 21 | __managed__ double unit_length_wire_cost, unit_via_cost, *unit_length_short_costs, *layer_min_len; 22 | __managed__ float *vcost, *wcost, *capacity, *demand; 23 | __managed__ double total_wirelength = 0, total_overflow_cost, layer_overflow_cost[10]; 24 | __managed__ int total_via_count = 0; 25 | 26 | __managed__ bool *is_of_net; 27 | __managed__ int *of_edge_sum; 28 | __managed__ int *routes, *timestamp, *pre_demand; 29 | __managed__ int *last, all_track_cnt, *idx2track, *track_net, *track_pos, *track_xy; 30 | __managed__ double *presum; 31 | __managed__ int *net_ids; 32 | int *net_ids_cpu; 33 | int global_timestamp = 0; 34 | vector> net_x_cpu, net_y_cpu; 35 | vector pin_cnt_sum_cpu; 36 | 37 | #define IDX(l, x, y) ((l) * X * Y + (x) * Y + (y)) 38 | #define THREAD_NUM 512 39 | #define BLOCK_NUM(n) ((n) / THREAD_NUM + 1) 40 | #define BLOCK_CNT(tot, thread_cnt) ((tot) / (thread_cnt) + ((tot) % (thread_cnt) > 0)) 41 | #define INF 1e22 42 | 43 | 44 | void build_cuda_database(); 45 | 46 | struct net { 47 | void construct_rsmt(); 48 | void generate_detours(bool *congestionView_cpu, float *congestionView_xsum_cpu, float *congestionView_ysum_cpu, 49 | bool construct_segments = true, bool display = false); 50 | void calc_hpwl(); 51 | 52 | int minx, maxx, miny, maxy, hpwl, original_net_id; 53 | vector pins; 54 | vector> rsmt; 55 | vector> rsmt_h_segments, rsmt_v_segments; 56 | vector par_num_cpu; 57 | vector par_num_sum_cpu; 58 | vector currentChildIDX_cpu; 59 | vector par_nodes_cpu; 60 | vector child_num_cpu; 61 | vector node_depth_cpu; 62 | vector nodes_cpu; 63 | vector points; 64 | int node_index_cnt = 0; 65 | int MAX_LAYER=10; 66 | int select_root = 0; 67 | 68 | flute::Tree tree; 69 | unordered_map layer; 70 | }; 71 | vector nets; 72 | 73 | 74 | void net::calc_hpwl() { 75 | minx = X, maxx = 0, miny = Y, maxy = 0; 76 | for(auto p : pins) { 77 | int x = p 
/ Y % X, y = p % Y; 78 | minx = min(minx, x); 79 | maxx = max(maxx, x); 80 | miny = min(miny, y); 81 | maxy = max(maxy, y); 82 | } 83 | hpwl = maxx - minx + maxy - miny; 84 | } 85 | 86 | 87 | vector> generate_batches_rsmt(vector &nets2route, int MAX_BATCH_SIZE = 1000000) { 88 | auto _time = elapsed_time(); 89 | vector> batches; 90 | vector> batch_vis; 91 | 92 | auto has_conflict = [&] (int net_id, int batch_id) { 93 | for(auto p : nets[net_id].points) if(batch_vis[batch_id][p]) return true; 94 | return false; 95 | }; 96 | 97 | auto mark_3x3 = [&] (int pos, int batch_id) { 98 | int _x = pos / Y, _y = pos % Y; 99 | for(int x = _x - 1; x <= _x + 1; x++) if(0 <= x && x < X) 100 | for(int y = _y - 1; y < _y + 1; y++) if(0 <= y && y < Y) 101 | if(x == _x || y == _y) batch_vis[batch_id][x * Y + y] = 1; 102 | }; 103 | 104 | 105 | long long segment_len = 0, segment_cnt = 0, failed = 0; 106 | for(auto net_id : nets2route) { 107 | int batch_id = -1; 108 | for(int i = 0; i < batches.size(); i++) if(batches[i].size() < MAX_BATCH_SIZE) 109 | { 110 | if(!has_conflict(net_id, i)) { batch_id = i; break; } 111 | else failed++; 112 | } 113 | if(batch_id == -1) { 114 | batch_id = batches.size(); 115 | batches.emplace_back(vector ()); 116 | batch_vis.emplace_back(vector (X * Y, 0)); 117 | } 118 | batches[batch_id].emplace_back(net_id); 119 | for(auto seg : nets[net_id].rsmt_h_segments) { 120 | segment_len += seg.second / Y - seg.first / Y; 121 | segment_cnt++; 122 | for(auto p = seg.first; p <= seg.second; p += Y) batch_vis[batch_id][p] = 1; 123 | } 124 | for(auto seg : nets[net_id].rsmt_v_segments) { 125 | for(auto p = seg.first; p <= seg.second; p += 1) batch_vis[batch_id][p] = 1; 126 | } 127 | for(auto p : nets[net_id].points) mark_3x3(p, batch_id); 128 | } 129 | _time = elapsed_time() - _time; 130 | if(LOG) cout << setw(40) << "Batch" << setw(20) << "#Nets" << setw(20) << "#Batches" << setw(20) << "Time" << endl; 131 | if(LOG) cout << setw(40) << "Generation" << setw(20) << 
nets2route.size() << setw(20) << batches.size() << setw(20) << setprecision(2) << _time << endl; 132 | return move(batches); 133 | } 134 | 135 | double TOT_RSMT_LENGTH = 0; 136 | vector> my_flute(unordered_set &pos) { 137 | const int MAX_DEGREE = 10000; 138 | vector x; 139 | vector y; 140 | int cnt = 0; 141 | vector nodes, parent; 142 | vector> edges; 143 | for(auto e : pos) { 144 | x.push_back(db::dr_x[e / Y]); 145 | y.push_back(db::dr_y[e % Y]); 146 | cnt++; 147 | } 148 | auto tree = flute::flute(cnt, x.data(), y.data(), 3); 149 | for(int i = 0; i < cnt * 2 - 2; i++) { 150 | flute::Branch &branch = tree.branch[i]; 151 | nodes.emplace_back(db::dr2gr_x[branch.x] * Y + db::dr2gr_y[branch.y]); 152 | } 153 | sort(nodes.begin(), nodes.end()); 154 | nodes.erase(unique(nodes.begin(), nodes.end()), nodes.end()); 155 | parent.resize(nodes.size()); 156 | for(int i = 0; i < nodes.size(); i++) parent[i] = i; 157 | edges.reserve(cnt * 2); 158 | for(int i = 0; i < cnt * 2 - 2; i++) if(tree.branch[i].n < cnt * 2 - 2) { 159 | Branch &branch1 = tree.branch[i], &branch2 = tree.branch[branch1.n]; 160 | int u, v; 161 | u = lower_bound(nodes.begin(), nodes.end(), db::dr2gr_x[branch1.x] * Y + db::dr2gr_y[branch1.y]) - nodes.begin(); 162 | v = lower_bound(nodes.begin(), nodes.end(), db::dr2gr_x[branch2.x] * Y + db::dr2gr_y[branch2.y]) - nodes.begin(); 163 | if(u == v) continue; 164 | edges.emplace_back(make_tuple(abs(branch1.x - branch2.x) + abs(branch1.y - branch2.y), u, v)); 165 | 166 | } 167 | sort(edges.begin(), edges.end()); 168 | function find_parent = [&] (int x) { return x == parent[x] ? 
x : parent[x] = find_parent(parent[x]); }; 169 | vector> graph(nodes.size()); 170 | for(auto edge : edges) { 171 | int u = get<1> (edge), v = get<2> (edge), par_u = find_parent(u), par_v = find_parent(v); 172 | if(par_u != par_v) { 173 | graph[u].emplace_back(v); 174 | graph[v].emplace_back(u); 175 | TOT_RSMT_LENGTH += get<0> (edge); 176 | parent[par_u] = par_v; 177 | } 178 | } 179 | int tot_degree = 0; 180 | for(int i = 0; i < nodes.size(); i++) tot_degree += graph[i].size(); 181 | assert(tot_degree == 2 * (nodes.size() - 1)); 182 | graph.emplace_back(move(nodes)); 183 | return move(graph); 184 | } 185 | 186 | void net::construct_rsmt() { 187 | unordered_map layer; 188 | unordered_set pos, nodes; 189 | for(int i = 0; i < pins.size(); i++) { 190 | int pos2D = pins[i] % (X * Y); 191 | pos.insert(pos2D); 192 | layer[pos2D] = pins[i] / X / Y; 193 | } 194 | assert(pos.size() == pins.size()); 195 | 196 | rsmt = my_flute(pos); 197 | rsmt_h_segments.clear(); 198 | rsmt_v_segments.clear(); 199 | rsmt_h_segments.reserve(rsmt.size()); 200 | rsmt_v_segments.reserve(rsmt.size()); 201 | points = rsmt.back(); 202 | for(int i = 0; i < rsmt.back().size(); i++) { 203 | int xi = rsmt.back()[i] / Y, yi = rsmt.back()[i] % Y; 204 | for(auto j : rsmt[i]) if(j < i) { 205 | int xj = rsmt.back()[j] / Y, yj = rsmt.back()[j] % Y; 206 | int minx = min(xi, xj), maxx = max(xi, xj), miny = min(yi, yj), maxy = max(yi, yj); 207 | if(xi != xj && yi != yj) { 208 | rsmt_h_segments.emplace_back(minx * Y + miny, maxx * Y + miny); 209 | rsmt_h_segments.emplace_back(minx * Y + maxy, maxx * Y + maxy); 210 | rsmt_v_segments.emplace_back(minx * Y + miny, minx * Y + maxy); 211 | rsmt_v_segments.emplace_back(maxx * Y + miny, maxx * Y + maxy); 212 | points.emplace_back(xi * Y + yj); 213 | points.emplace_back(xj * Y + yi); 214 | } else if(xi != xj) { 215 | rsmt_h_segments.emplace_back(minx * Y + miny, maxx * Y + miny); 216 | } else if(yi != yj) { 217 | rsmt_v_segments.emplace_back(minx * Y + miny, minx * Y + 
maxy); 218 | } else { 219 | cerr << "error" << endl; 220 | } 221 | } 222 | } 223 | for(auto &e : rsmt.back()) 224 | e += (layer.count(e) ? layer[e] : L) * X * Y; 225 | } 226 | 227 | 228 | typedef unsigned int BITSET_TYPE; 229 | const int BITSET_LEN = 32; 230 | 231 | int select_root_net(vector> rsmt) 232 | { 233 | queue> list; 234 | int visited[rsmt.size()-1]; 235 | int select = -1; 236 | for(int i=0; i < rsmt.size()-1; i++) 237 | { 238 | visited[i]=0; 239 | if(rsmt[i].size()==1) 240 | { 241 | list.push(make_pair(i, -1)); 242 | } 243 | } 244 | while(!list.empty()) 245 | { 246 | pair front_element = list.front(); 247 | list.pop(); 248 | select = front_element.first; 249 | if(visited[select]) continue; 250 | visited[select]=1; 251 | int fa = front_element.second; 252 | for(int j = 0; j< rsmt[select].size(); j++) 253 | { 254 | if(rsmt[select][j]!=fa) 255 | { 256 | list.push(make_pair(rsmt[select][j], select)); 257 | } 258 | } 259 | } 260 | return select; 261 | } 262 | 263 | void net::generate_detours(bool *congestionView_cpu, float *congestionView_xsum_cpu, float *congestionView_ysum_cpu, 264 | bool construct_segments, bool display) { 265 | 266 | auto graph_x = rsmt; 267 | int node_estimate = (graph_x.size()-1)*10; 268 | 269 | par_num_cpu.clear(); 270 | par_num_sum_cpu.clear(); 271 | currentChildIDX_cpu.clear(); 272 | par_nodes_cpu.clear(); 273 | child_num_cpu.clear(); 274 | node_depth_cpu.clear(); 275 | nodes_cpu.clear(); 276 | node_index_cnt = 0; 277 | 278 | par_num_cpu.reserve(node_estimate); 279 | par_num_sum_cpu.reserve(node_estimate); 280 | currentChildIDX_cpu.reserve(node_estimate); 281 | par_nodes_cpu.reserve(node_estimate); 282 | child_num_cpu.reserve(node_estimate); 283 | node_depth_cpu.reserve(node_estimate); 284 | nodes_cpu.reserve(node_estimate); 285 | 286 | par_num_cpu.emplace_back(0); 287 | par_num_sum_cpu.emplace_back(0); 288 | currentChildIDX_cpu.emplace_back(0); 289 | currentChildIDX_cpu.emplace_back(0); 290 | currentChildIDX_cpu.emplace_back(0); 291 
| currentChildIDX_cpu.emplace_back(0); 292 | par_nodes_cpu.emplace_back(0); 293 | child_num_cpu.emplace_back(0); 294 | node_depth_cpu.emplace_back(0); 295 | nodes_cpu.emplace_back(0); 296 | 297 | int depth_max = 0; 298 | select_root = select_root_net(graph_x); 299 | vector congestionRegionID[2]; 300 | vector >> congestionRanges; 301 | congestionRanges.resize(2); 302 | vector>> stems; 303 | stems.resize(2); 304 | congestionRegionID[0].resize(graph_x.size()); 305 | congestionRegionID[1].resize(graph_x.size()); 306 | for(int g_i = 0; g_i < graph_x.size(); g_i++) 307 | { 308 | congestionRegionID[0][g_i] = -1; 309 | congestionRegionID[1][g_i] = -1; 310 | } 311 | congestionRanges[0].resize(graph_x.size()); 312 | congestionRanges[1].resize(graph_x.size()); 313 | stems[0].resize(graph_x.size()); 314 | stems[1].resize(graph_x.size()); 315 | function getRegionID = [&] (int x, int direction) { 316 | if(x==-1) return -1; 317 | if(congestionRegionID[direction][x] == -1) return -1; 318 | if(congestionRegionID[direction][x] != x) 319 | { 320 | if(congestionRegionID[direction][x] == x) 321 | { 322 | assert(0); 323 | } 324 | int ans = getRegionID(congestionRegionID[direction][x], direction); 325 | congestionRegionID[direction][x] = ans; 326 | return ans; 327 | }else{ 328 | return congestionRegionID[direction][x]; 329 | } 330 | }; 331 | for(int x = 0; x < graph_x.back().size(); x++) 332 | { 333 | int position_cur = graph_x.back()[x]; 334 | int curl = position_cur / Y /X, curx = position_cur / Y % X, cury = position_cur % Y; 335 | for(int dir=0; dir<2; dir++) 336 | { 337 | if(curl markCongestion = [&] (int x, int par) { 348 | int position_cur = graph_x.back()[x]; 349 | int curx = position_cur / Y % X, cury = position_cur % Y; 350 | for(auto e : graph_x[x]) if(e != par) 351 | { 352 | int ex = graph_x.back()[e] / Y % X; 353 | int ey = graph_x.back()[e] % Y; 354 | int dir=-1; 355 | int congestion = -1; 356 | if(ex == curx) 357 | { 358 | bool is_congestion_y = 
(congestionView_ysum_cpu[curx*Y+max(ey, cury)]-congestionView_ysum_cpu[curx*Y+min(ey, cury)])>0; 359 | if(is_congestion_y) 360 | { 361 | dir = 1; 362 | congestion = 1; 363 | } 364 | } 365 | else if(ey == cury) 366 | { 367 | bool is_congestion_x = (congestionView_xsum_cpu[max(ex, curx)*Y+cury]-congestionView_xsum_cpu[min(ex, curx)*Y+cury])>0; 368 | if(is_congestion_x) 369 | { 370 | dir = 0; 371 | congestion = 1; 372 | } 373 | } 374 | else{ 375 | bool is_congestion_y = (congestionView_ysum_cpu[curx*Y+max(ey, cury)]-congestionView_ysum_cpu[curx*Y+min(ey, cury)])>0 || 376 | (congestionView_ysum_cpu[ex*Y+max(ey, cury)]-congestionView_ysum_cpu[ex*Y+min(ey, cury)])>0; 377 | if(is_congestion_y) 378 | { 379 | dir = 1; 380 | congestion = 2; 381 | } 382 | if(congestion==-1) 383 | { 384 | 385 | bool is_congestion_x = (congestionView_xsum_cpu[max(ex, curx)*Y+cury]-congestionView_xsum_cpu[min(ex, curx)*Y+cury])>0 || 386 | (congestionView_xsum_cpu[max(ex, curx)*Y+ey]-congestionView_xsum_cpu[min(ex, curx)*Y+ey])>0; 387 | if(is_congestion_x) 388 | { 389 | dir = 0; 390 | congestion = 2; 391 | } 392 | } 393 | } 394 | if(congestion == 1) 395 | { 396 | int target_region = -1; 397 | if(x!=select_root) 398 | { 399 | if(congestionRegionID[dir][x]==-1) 400 | { 401 | congestionRegionID[dir][x] = x; 402 | } 403 | int region_x = getRegionID(x, dir); 404 | target_region = region_x; 405 | }else{ 406 | if(congestionRegionID[dir][e]==-1) 407 | { 408 | congestionRegionID[dir][e] = e; 409 | } 410 | int region_e = getRegionID(e, dir); 411 | target_region = region_e; 412 | } 413 | 414 | congestionRegionID[dir][e] = target_region; 415 | if(x!=select_root) 416 | for(auto pos: stems[dir][e]) 417 | { 418 | stems[dir][target_region].push_back(pos); 419 | } 420 | congestionRanges[dir][target_region].first = min(congestionRanges[dir][target_region].first, congestionRanges[dir][e].first); 421 | congestionRanges[dir][target_region].second = max(congestionRanges[dir][target_region].second, 
congestionRanges[dir][e].second); 422 | } 423 | else if(congestion == 2) 424 | { 425 | if(x!=select_root&&congestionRegionID[dir][x]==-1) 426 | { 427 | congestionRegionID[dir][x] = x; 428 | 429 | } 430 | if(congestionRegionID[dir][e]==-1) 431 | { 432 | congestionRegionID[dir][e] = e; 433 | } 434 | } 435 | markCongestion(e, x); 436 | } 437 | for(int dir=0; dir<2; dir++) 438 | { 439 | int x_region = getRegionID(x, dir); 440 | for(auto e : graph_x[x]) if(e != par) 441 | { 442 | int ex = graph_x.back()[e] / Y % X; 443 | int ey = graph_x.back()[e] % Y; 444 | int e_region = getRegionID(e, dir); 445 | if(x_region!=e_region) 446 | { 447 | if(x_region>=0) 448 | { 449 | stems[dir][x_region].push_back(dir?ex:ey); 450 | congestionRanges[dir][x_region].first = min(congestionRanges[dir][x_region].first, dir?ey:ex); 451 | congestionRanges[dir][x_region].second = max(congestionRanges[dir][x_region].second, dir?ey:ex); 452 | } 453 | if(e_region>=0) 454 | { 455 | stems[dir][e_region].push_back(dir?curx:cury); 456 | congestionRanges[dir][e_region].first = min(congestionRanges[dir][e_region].first, dir?cury:curx); 457 | congestionRanges[dir][e_region].second = max(congestionRanges[dir][e_region].second, dir?cury:curx); 458 | } 459 | } 460 | } 461 | } 462 | }; 463 | markCongestion(select_root, -1); 464 | 465 | function create_node = [&] (int l, int x, int y, int num_child) { 466 | int node_idx_insert = node_index_cnt++; 467 | par_num_cpu.emplace_back(0); 468 | par_num_sum_cpu.emplace_back(0); 469 | par_nodes_cpu.emplace_back(0); 470 | child_num_cpu.emplace_back(0); 471 | node_depth_cpu.emplace_back(0); 472 | nodes_cpu.emplace_back(0); 473 | 474 | par_num_sum_cpu[node_idx_insert+1] = 0; 475 | node_depth_cpu[node_idx_insert] = 0; 476 | child_num_cpu[node_idx_insert] = num_child; 477 | nodes_cpu[node_idx_insert] = l * X * Y + x * Y + y; 478 | return node_idx_insert; 479 | }; 480 | 481 | function connect_node = [&] (int par_node_index, int cur_index, int cur_child_id) { 482 | int 
node_idx_insert = cur_index; 483 | int position_cur = nodes_cpu[cur_index]; 484 | int curx = position_cur/ Y % X, cury = position_cur % Y; 485 | int position_par = nodes_cpu[par_node_index]; 486 | int parx = position_par/ Y % X, pary = position_par % Y; 487 | if(construct_segments) 488 | { 489 | if(curx==parx) 490 | { 491 | rsmt_v_segments.emplace_back(make_pair(curx*Y+min(cury, pary), curx*Y+max(cury, pary))); 492 | } 493 | if(cury==pary) 494 | { 495 | rsmt_h_segments.emplace_back(make_pair(min(curx, parx)+cury*X, max(curx, parx)+cury*X)); 496 | } 497 | } 498 | points.emplace_back(curx*Y+cury); 499 | points.emplace_back(parx*Y+pary); 500 | assert(curx==parx||cury==pary); 501 | node_depth_cpu[node_idx_insert] = max(node_depth_cpu[node_idx_insert], node_depth_cpu[par_node_index] + 1); 502 | int depth = node_depth_cpu[node_idx_insert]; 503 | 504 | int position = par_num_sum_cpu[node_idx_insert]+par_num_cpu[node_idx_insert]++; 505 | par_nodes_cpu.emplace_back(0); 506 | currentChildIDX_cpu.emplace_back(0); 507 | par_nodes_cpu[position] = par_node_index; 508 | depth_max = max(depth_max, depth+1); 509 | assert(cur_child_id calc_displace = [&] (int query_pos, int dir, int region_id) { 516 | int ans = 0; 517 | for(auto pos: stems[dir][region_id]) 518 | { 519 | ans+=abs(pos-query_pos); 520 | } 521 | return ans; 522 | }; 523 | function(int, int)> get_mirror_places = [&] (int graph_node_id, int dir) { 524 | assert(graph_node_id=0); 527 | int congestion_region = congestionRegionID[dir][graph_node_id]; 528 | int curx = position_cur/ Y % X, cury = position_cur % Y; 529 | int trunk_len = congestionRanges[dir][congestion_region].second - congestionRanges[dir][congestion_region].first; 530 | int max_displace = ratio*float(trunk_len); 531 | int origional_pos = dir?curx:cury; 532 | int origional_displacement = calc_displace(origional_pos, dir, congestion_region); 533 | int init_low = origional_pos; 534 | int init_high = origional_pos; 535 | int bound = dir?X:Y; 536 | while (init_low 
- 1 >= 0 && calc_displace(init_low - 1, dir, congestion_region) - origional_displacement <= max_displace) init_low--; 537 | while (init_high + 1 < bound && calc_displace(init_high - 1, dir, congestion_region) - origional_displacement <= max_displace) init_high++; 538 | int step = 1; 539 | while ((origional_pos - init_low) / (step + 1) + (init_high - origional_pos) / (step + 1) >= num_tracks) step++; 540 | init_low = origional_pos - (origional_pos - init_low) / step * step; 541 | init_high = origional_pos + (init_high - origional_pos) / step * step; 542 | vector shifts; 543 | for (double pos = init_low; pos <= init_high; pos += step) { 544 | int shiftAmount = (pos - origional_pos); 545 | if(shiftAmount==0) continue; 546 | shifts.push_back(pos); 547 | int min_trunk = congestionRanges[dir][congestion_region].first; 548 | int max_trunk = congestionRanges[dir][congestion_region].second; 549 | } 550 | std::vector indices(shifts.size()); 551 | for (size_t i = 0; i < indices.size(); ++i) { 552 | indices[i] = i; 553 | } 554 | std::vector new_shifts; 555 | for (int index : indices) { 556 | new_shifts.push_back(shifts[index]); 557 | } 558 | 559 | shifts = new_shifts; 560 | return shifts; 561 | }; 562 | 563 | function>, vector>)> dfs_detours = [&] (int x, int par, int par_node_idx, int child_idx, int depth, vector> mirrors, vector> old_mirror_places) { 564 | if(mirrors.size()==0) 565 | { 566 | mirrors.resize(2); 567 | } 568 | int size = graph_x.back().size() - 1; 569 | int position_cur = graph_x.back()[x]; 570 | int curl = position_cur / Y /X, curx = position_cur/ Y % X, cury = position_cur % Y; 571 | int node_idx = -1; 572 | if(x==select_root) 573 | { 574 | node_idx = create_node(curl,curx,cury, graph_x[x].size()); 575 | par_num_sum_cpu[node_idx+1] += par_num_cpu[node_idx]; 576 | par_num_sum_cpu[node_idx+1] += par_num_sum_cpu[node_idx]; 577 | } 578 | vector> new_mirrors; 579 | vector> mirror_places; 580 | new_mirrors.resize(2); 581 | mirror_places.resize(2); 582 | 
if(old_mirror_places.size()==2) 583 | for(int dir=0; dir<2; dir++) 584 | { 585 | int region_id = getRegionID(congestionRegionID[dir][x], dir); 586 | int par_region_id = getRegionID(congestionRegionID[dir][par], dir); 587 | if(region_id >= 0) 588 | { 589 | assert(region_id0); 645 | new_mirrors[dir].push_back(new_mirror); 646 | par_num_sum_cpu[new_mirror + 1] += par_num_cpu[new_mirror]; 647 | par_num_sum_cpu[new_mirror + 1] += par_num_sum_cpu[new_mirror]; 648 | } 649 | } 650 | } 651 | if(par_node_idx == -1){} 652 | else { 653 | int px = nodes_cpu[par_node_idx] / Y % X, py = nodes_cpu[par_node_idx] % Y; 654 | vector pre_node_idxs; 655 | vector pre_node_idxs_direct; 656 | if(px != curx && py != cury) 657 | { 658 | for (int pathIndex = 0; pathIndex <= 1; pathIndex++) { 659 | int midx = pathIndex ? curx : px; 660 | int midy = pathIndex ? py : cury; 661 | int pre_node = create_node(MAX_LAYER-1, midx, midy, 1); 662 | connect_node(par_node_idx, pre_node, child_idx); 663 | par_num_sum_cpu[pre_node + 1] += par_num_cpu[pre_node]; 664 | par_num_sum_cpu[pre_node + 1] += par_num_sum_cpu[pre_node]; 665 | pre_node_idxs.push_back(pre_node); 666 | } 667 | 668 | for (int pathIndex = 0; pathIndex <= 1; pathIndex++) { 669 | int length_edge = pathIndex ? (max(py, cury) - min(py, cury)) : (max(px, curx) - min(px, curx)); 670 | int max_z_shape = min(10, length_edge); 671 | for(int dispace_id = 1; dispace_id < max_z_shape; dispace_id++) 672 | { 673 | int midx1 = pathIndex ? px : (px*dispace_id+curx*(max_z_shape-dispace_id))/max_z_shape; 674 | int midy1 = pathIndex ? (py*dispace_id+cury*(max_z_shape-dispace_id))/max_z_shape : py; 675 | int pre_node1 = create_node(MAX_LAYER-1, midx1, midy1, 1); 676 | connect_node(par_node_idx, pre_node1, child_idx); 677 | par_num_sum_cpu[pre_node1 + 1] += par_num_cpu[pre_node1]; 678 | par_num_sum_cpu[pre_node1 + 1] += par_num_sum_cpu[pre_node1]; 679 | 680 | int midx2 = pathIndex ? 
curx : (px*dispace_id+curx*(max_z_shape-dispace_id))/max_z_shape; 681 | int midy2 = pathIndex ? (py*dispace_id+cury*(max_z_shape-dispace_id))/max_z_shape : cury; 682 | int pre_node2 = create_node(MAX_LAYER-1, midx2, midy2, 1); 683 | connect_node(pre_node1, pre_node2, 0); 684 | par_num_sum_cpu[pre_node2 + 1] += par_num_cpu[pre_node2]; 685 | par_num_sum_cpu[pre_node2 + 1] += par_num_sum_cpu[pre_node2]; 686 | pre_node_idxs.push_back(pre_node2); 687 | } 688 | } 689 | } 690 | for(int dir = 0; dir<2; dir++) 691 | { 692 | int region_id = getRegionID(congestionRegionID[dir][x], dir); 693 | int par_region_id = getRegionID(congestionRegionID[dir][par], dir); 694 | if(par_region_id>=0&®ion_id!=par_region_id) 695 | { 696 | for(auto node_par_mirror: mirrors[dir]) 697 | { 698 | int position_par = nodes_cpu[node_par_mirror]; 699 | int parx = position_par/ Y % X, pary = position_par % Y; 700 | if(parx!=curx&&pary!=cury) 701 | { 702 | for (int pathIndex = 0; pathIndex <= 1; pathIndex++) { 703 | int midx = pathIndex ? curx : parx; 704 | int midy = pathIndex ? 
pary : cury; 705 | int node_insert = create_node(MAX_LAYER-1, midx, midy, 1); 706 | connect_node(node_par_mirror, node_insert, child_idx); 707 | par_num_sum_cpu[node_insert + 1] += par_num_cpu[node_insert] + par_num_sum_cpu[node_insert]; 708 | pre_node_idxs.push_back(node_insert); 709 | } 710 | }else{ 711 | pre_node_idxs_direct.push_back(node_par_mirror); 712 | } 713 | } 714 | } 715 | } 716 | int connect_parent = par_node_idx; 717 | int connect_child_idx = child_idx; 718 | 719 | node_idx = create_node(curl, curx, cury, graph_x[x].size() - 1); 720 | 721 | if(px == curx || py == cury){ 722 | connect_node(connect_parent, node_idx, connect_child_idx); 723 | } 724 | if(x!=select_root) 725 | { 726 | for(auto pre_node_idx: pre_node_idxs) 727 | { 728 | connect_node(pre_node_idx, node_idx, 0); 729 | } 730 | for(auto pre_node_idx: pre_node_idxs_direct) 731 | { 732 | connect_node(pre_node_idx, node_idx, child_idx); 733 | } 734 | } 735 | } 736 | depth_max = max(depth_max, node_depth_cpu[node_idx]+1); 737 | if(x!=select_root) 738 | { 739 | par_num_sum_cpu[node_idx+1] += (par_num_sum_cpu[node_idx] + par_num_cpu[node_idx]); 740 | } 741 | for(int dir=0; dir<2; dir++) 742 | { 743 | int region_id = getRegionID(x, dir); 744 | int par_region_id = getRegionID(par, dir); 745 | if(region_id<0) continue; 746 | if(true) 747 | { 748 | int pos2 = dir?cury:curx; 749 | assert(region_id>=0||region_id= X * Y) p -= X * Y; 832 | db_net.subnets.emplace_back(nets.size()); 833 | nets.emplace_back(move(new_net)); 834 | db_net.unfinished_subnet_count = db_net.subnets.size(); 835 | continue; 836 | } 837 | net_break_count++; 838 | vector pins = db_net.pins, sz(db_net.pins.size(), 1), par(db_net.pins.size()); 839 | vector> edges; 840 | edges.reserve(db_net.pins.size() * db_net.pins.size() / 2); 841 | 842 | function find_par = [&] (int x) { return x == par[x] ? 
x : par[x] = find_par(par[x]); }; 843 | 844 | for(int i = 0; i < db_net.pins.size(); i++) { 845 | par[i] = i; 846 | if(pins[i] >= X * Y) pins[i] -= X * Y; 847 | for(int j = 0; j < i; j++) { 848 | int x0 = db_net.pins[i] / Y % X, y0 = db_net.pins[i] % Y; 849 | int x1 = db_net.pins[j] / Y % X, y1 = db_net.pins[j] % Y; 850 | edges.emplace_back(make_tuple(j, i, abs(x0 - x1) + abs(y0 - y1))); 851 | } 852 | } 853 | sort(edges.begin(), edges.end(), [&] (tuple l, tuple r) { 854 | return get<2> (l) < get<2> (r); 855 | }); 856 | for(auto e : edges) { 857 | int u = find_par(get<0> (e)), v = find_par(get<1> (e)); 858 | if(u == v || sz[u] + sz[v] > MAX_PIN_SIZE) continue; 859 | if(sz[u] > sz[v]) swap(u, v); 860 | par[u] = v; 861 | sz[v] += sz[u]; 862 | } 863 | vector> new_pins(pins.size()); 864 | for(int i = 0; i < pins.size(); i++) 865 | new_pins[find_par(i)].emplace_back(pins[i]); 866 | for(auto e : edges) { 867 | int u = get<0> (e), v = get<1> (e); 868 | int par_u = find_par(u), par_v = find_par(v); 869 | if(par_u == par_v) continue; 870 | if(sz[par_u] > sz[par_v]) { 871 | swap(u, v); 872 | swap(par_u, par_v); 873 | } 874 | sz[par_u]++; 875 | new_pins[par_u].emplace_back(pins[v]); 876 | par[par_v] = par_u; 877 | db_net.subnets.emplace_back(nets.size()); 878 | nets.emplace_back(net()); 879 | nets.back().pins = move(new_pins[par_v]); 880 | nets.back().original_net_id = db_net_id; 881 | } 882 | for(int i = 0; i < pins.size(); i++) if(find_par(i) == i) { 883 | db_net.subnets.emplace_back(nets.size()); 884 | nets.emplace_back(net()); 885 | nets.back().pins = move(new_pins[i]); 886 | nets.back().original_net_id = db_net_id; 887 | } 888 | db_net.unfinished_subnet_count = db_net.subnets.size(); 889 | } 890 | pin_cnt_sum_cpu.resize(1 + nets.size(), 0); 891 | for(int i = 0; i < nets.size(); i++) { 892 | nets[i].calc_hpwl(); 893 | pin_cnt_sum_cpu[i + 1] = pin_cnt_sum_cpu[i] + nets[i].pins.size(); 894 | } 895 | printf(" MAX PINS: %d\n", max_pin_cnt); 896 | printf(" Broken Nets: %d\n", 
net_break_count); 897 | 898 | 899 | NET_NUM = nets.size(); 900 | 901 | 902 | DIR = db::layers[1].dir; 903 | unit_length_wire_cost = db::unit_length_wire_cost; 904 | unit_via_cost = db::unit_via_cost; 905 | 906 | assert(X - 1 <= MAX_LEN_INT); 907 | for(int i = 0; i < X - 1; i++) temp_int[i] = db::x_edge_len[i]; 908 | cudaMalloc(&x_edge_len, (X - 1) * sizeof(int)); 909 | cudaMemcpy(x_edge_len, temp_int, (X - 1) * sizeof(int), cudaMemcpyHostToDevice); 910 | 911 | 912 | assert(Y - 1 <= MAX_LEN_INT); 913 | for(int i = 0; i < Y - 1; i++) temp_int[i] = db::y_edge_len[i]; 914 | cudaMalloc(&y_edge_len, (Y - 1) * sizeof(int)); 915 | cudaMemcpy(y_edge_len, temp_int, (Y - 1) * sizeof(int), cudaMemcpyHostToDevice); 916 | 917 | assert(L <= MAX_LEN_DOUBLE); 918 | for(int i = 0; i < L; i++) temp_double[i] = db::unit_length_short_costs[i + 1]; 919 | cudaMalloc(&unit_length_short_costs, L * sizeof(double)); 920 | cudaMemcpy(unit_length_short_costs, temp_double, L * sizeof(double), cudaMemcpyHostToDevice); 921 | 922 | assert(L <= MAX_LEN_DOUBLE); 923 | for(int i = 0; i < L; i++) temp_double[i] = db::layers[i + 1].min_len; 924 | cudaMalloc(&layer_min_len, L * sizeof(double)); 925 | cudaMemcpy(layer_min_len, temp_double, L * sizeof(double), cudaMemcpyHostToDevice); 926 | 927 | 928 | assert(L * X * Y <= MAX_LEN_DOUBLE); 929 | for(int l = 0; l < L; l++) 930 | for(int x = 0; x < X; x++) 931 | for(int y = 0; y < Y; y++) 932 | temp_float[IDX(l, x, y)] = db::capacity[l + 1][x][y]; 933 | cudaMalloc(&capacity, L * X * Y * sizeof(float)); 934 | cudaMemcpy(capacity, temp_float, L * X * Y * sizeof(float), cudaMemcpyHostToDevice); 935 | 936 | 937 | cudaMalloc(&pin_acc_num, (1 + NET_NUM) * sizeof(int)); 938 | cudaMemcpy(pin_acc_num, pin_cnt_sum_cpu.data(), (1 + NET_NUM) * sizeof(int), cudaMemcpyHostToDevice); 939 | PIN_NUM = pin_cnt_sum_cpu.back(); 940 | 941 | if(LOG) cerr << "PIN_NUM " << PIN_NUM << endl; 942 | 943 | 944 | assert(PIN_NUM <= MAX_LEN_INT); 945 | 946 | for(auto &dbnet : db::nets) 947 
| for(auto pin : dbnet.pins) 948 | if(pin < X * Y) total_via_count++; 949 | 950 | for(int i = 0, pin_id = 0; i < NET_NUM; i++) 951 | for(auto pin : nets[i].pins) temp_int[pin_id++] = pin; 952 | cudaMalloc(&pins, PIN_NUM * sizeof(int)); 953 | cudaMemcpy(pins, temp_int, PIN_NUM * sizeof(int), cudaMemcpyHostToDevice); 954 | 955 | 956 | net_x_cpu.resize(NET_NUM); 957 | net_y_cpu.resize(NET_NUM); 958 | 959 | 960 | all_track_cnt = 0; 961 | for(int l = 0; l < L; l++) all_track_cnt += (l & 1 ^ DIR) ? X : Y; 962 | assert(all_track_cnt <= MAX_LEN_INT); 963 | for(int l = 0, cnt = 0; l < L; l++) { 964 | if((l & 1 ^ DIR) == 0) for(int y = 0; y < Y; y++) temp_int[cnt++] = l * XY + y; 965 | if((l & 1 ^ DIR) == 1) for(int x = 0; x < X; x++) temp_int[cnt++] = l * XY + x; 966 | if(l + 1 == L) assert(cnt == all_track_cnt); 967 | } 968 | cudaMalloc(&idx2track, all_track_cnt * sizeof(int)); 969 | cudaMemcpy(idx2track, temp_int, sizeof(int) * all_track_cnt, cudaMemcpyHostToDevice); 970 | 971 | 972 | cudaMalloc(&congestion, X * Y * sizeof(bool)); 973 | cudaMalloc(&congestion_xsum, X * Y * sizeof(float)); 974 | cudaMalloc(&congestion_ysum, X * Y * sizeof(float)); 975 | 976 | 977 | cudaMallocManaged(&routes, (ROUTE_PER_PIN * PIN_NUM) * sizeof(int)); 978 | 979 | cudaMalloc(&wcost, L * X * Y * sizeof(float)); 980 | cudaMalloc(&vcost, L * X * Y * sizeof(float)); 981 | cudaMalloc(&presum, L * X * Y * sizeof(double)); 982 | cudaMalloc(&demand, L * X * Y * sizeof(float)); 983 | cudaMalloc(&pre_demand, L * X * Y * sizeof(int)); 984 | 985 | 986 | 987 | cudaMalloc(&net_ids, NET_NUM * sizeof(int)); 988 | cudaMalloc(&is_of_net, NET_NUM * sizeof(bool)); 989 | cudaMalloc(&of_edge_sum, L * X * Y * sizeof(int)); 990 | cudaMalloc(&last, L * X * Y * sizeof(int)); 991 | cudaMalloc(×tamp, L * X * Y * sizeof(int)); 992 | 993 | 994 | 995 | 996 | cudaMemset(demand, 0, sizeof(float) * L * X * Y); 997 | cudaMemset(timestamp, 0, sizeof(int) * L * X * Y); 998 | cudaMemset(pre_demand, 0, sizeof(int) * L * X * Y); 
999 | 1000 | net_ids_cpu = new int[NET_NUM]; 1001 | } 1002 | 1003 | } 1004 | 1005 | using namespace cudb; 1006 | --------------------------------------------------------------------------------