├── README.md
├── data
│   └── subgraphs
│       └── yelp_sub.npz
└── sw
    ├── README.md
    └── redundancy_reduction
        └── rr.py

/README.md:
--------------------------------------------------------------------------------
# GraphACT: Accelerating GCN Training on CPU-FPGA Heterogeneous Platforms

Hanqing Zeng, Viktor Prasanna

Contact:

Hanqing Zeng (zengh@usc.edu)

**Updates**

03/05/2021: We have released the IP cores for GraphACT at [this repository](https://github.com/GraphSAINT/GNN-ARCH).
* The IP cores improve upon the GraphACT design by supporting two computation orders of feature aggregation and weight transformation. See [our ASAP paper](https://ieeexplore.ieee.org/abstract/document/9153263) for a description of the two orders.
* The IP cores now support both the training and inference algorithms on FPGA. We will soon add to this repository the complete training architecture built with those IP cores as building blocks.

We will also soon release the parallel C++ implementation of the redundancy reduction algorithm in this repository.

**NOTE**

* The GCN training algorithm, together with its implementation, is based on the paper "Accurate, Efficient and Scalable Graph Embedding" (IEEE IPDPS '19).
* Alternatively, you can refer to our more recent [ICLR '20 paper](https://arxiv.org/abs/1907.04931) (and its [implementation](https://github.com/GraphSAINT/GraphSAINT)) for a better graph-sampling-based minibatch training algorithm.
* The implementations of the redundancy reduction algorithm, the FPGA architecture, and the performance model will be uploaded soon.


**Citation**

```
@inproceedings{graphact,
  author = {Zeng, Hanqing and Prasanna, Viktor},
  title = {GraphACT: Accelerating GCN Training on CPU-FPGA Heterogeneous Platforms},
  year = {2020},
  isbn = {9781450370998},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3373087.3375312},
  doi = {10.1145/3373087.3375312},
  booktitle = {Proceedings of the 2020 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  pages = {255--265},
  numpages = {11},
  location = {Seaside, CA, USA},
  series = {FPGA '20}
}
```

--------------------------------------------------------------------------------
/data/subgraphs/yelp_sub.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GraphSAINT/GraphACT/ede8b95c703a3310e0f2a1ec4c77e324c6a15840/data/subgraphs/yelp_sub.npz
--------------------------------------------------------------------------------
/sw/README.md:
--------------------------------------------------------------------------------
### Redundancy Reduction

Currently, only the (slow) Python implementation of redundancy reduction is provided. The parallel C++ implementation will come soon.

To run the Python redundancy reduction, go into the `redundancy_reduction` directory and execute:

```
python rr.py --adj <path to adjacency .npz> --round <number of rounds>
```

An example subgraph adjacency matrix is provided at `../data/subgraphs/yelp_sub.npz` (relative to this `sw` directory; from inside `redundancy_reduction` the path is `../../data/subgraphs/yelp_sub.npz`).
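For intuition, below is a minimal sketch (illustration only; the toy graph and numbers are not part of the repo) of the redundancy that `rr.py` exploits: when two nodes appear together in many neighbor lists, their partial sum can be computed once and reused for every common neighbor.

```
import numpy as np

# Toy feature vector and neighbor lists. Nodes 0 and 1 are common
# neighbors of rows 2, 3 and 4, so naive aggregation would recompute
# x[0] + x[1] three times.
x = np.random.rand(5)
neighbors = {2: [0, 1], 3: [0, 1, 4], 4: [0, 1]}

naive = {v: sum(x[u] for u in nbrs) for v, nbrs in neighbors.items()}

# Redundancy reduction: precompute the shared pair once ...
x01 = x[0] + x[1]
# ... and reuse it, so fewer additions are performed overall.
reduced = {2: x01, 3: x01 + x[4], 4: x01}

assert all(np.isclose(naive[v], reduced[v]) for v in neighbors)
```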
--------------------------------------------------------------------------------
/sw/redundancy_reduction/rr.py:
--------------------------------------------------------------------------------
import argparse
from operator import itemgetter

import numpy as np
import scipy.sparse as sp


def parse_args():
    parser = argparse.ArgumentParser(description='arguments for redundancy reduction')
    parser.add_argument('--adj', type=str, required=True,
                        help='path to the adjacency file (scipy.sparse csr_matrix stored as npz)')
    parser.add_argument('--round', type=int, required=True,
                        help='total number of rounds of reduction to perform')
    args = parser.parse_args()
    return args


def construct_ga(adj_gs):
    """
    Construct the weighted "aggregation graph": for every pair of nodes (u, w)
    that appear together in some neighbor list, count the number of neighbor
    lists in which they co-occur. That count is the number of times naive
    aggregation would recompute the partial sum x_u + x_w.
    """
    assert adj_gs.shape[0] == adj_gs.shape[1], "adjacency matrix must be square"
    num_v = adj_gs.shape[0]
    weight_edges = dict()
    for v in range(num_v):
        # sorting guarantees each pair is always recorded as (u, w) with u < w;
        # assumes no self-loops, i.e. the diagonal of the subgraph adj is 0
        neigh = np.sort(adj_gs.indices[adj_gs.indptr[v]:adj_gs.indptr[v+1]])
        for iu, u in enumerate(neigh):
            for w in neigh[iu+1:]:
                if (u, w) not in weight_edges:
                    weight_edges[(u, w)] = 1
                else:
                    weight_edges[(u, w)] += 1
    return weight_edges, num_v


def obtain_precompute_edges(weight_edges, num_v):
    """
    Greedily select a matching M on the aggregation graph: repeatedly take the
    heaviest remaining pair whose endpoints are both still unmatched. Each
    selected pair of weight w is precomputed once and reused, saving w - 1
    additions; the total saving is accumulated in _W.
    """
    M = []
    # only pairs co-occurring in more than two neighbor lists are candidates
    H = {k: v for k, v in weight_edges.items() if v > 2}
    H_sorted = sorted(H.items(), key=itemgetter(1, 0), reverse=True)
    S = np.ones(num_v)    # S[u] == 1 iff node u is still unmatched
    _W = 0
    for (u, v), weight in H_sorted:
        if not (S[u] and S[v]):
            continue
        _W += weight - 1
        S[u] = 0; S[v] = 0
        M.append((u, v))
        if len(M) == int(num_v/2):
            # a matching contains at most |V|/2 edges
            break
    return M, _W

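# A worked toy example of the two steps above (illustration only; these
# numbers are not taken from the repo). For the neighbor lists
#     {2: [0, 1], 3: [0, 1], 4: [0, 1, 2]},
# construct_ga produces the pair weights
#     {(0, 1): 3, (0, 2): 1, (1, 2): 1}.
# Only (0, 1) passes the weight threshold, so obtain_precompute_edges returns
# M = [(0, 1)] with _W = 2: precomputing x0 + x1 once turns four additions
# (one each in rows 2 and 3, two in row 4) into two (the precompute itself
# plus one addition in row 4).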

def obtain_compact_mat(adj_gs, M, feat):
    """
    Rewrite the graph according to the matching M: for each matched pair,
    append a new "merged" node whose feature is the precomputed partial sum,
    and substitute that node for the pair in every neighbor list containing
    both endpoints.
    """
    ret_feat = np.zeros(feat.size + len(M))
    ret_feat[:feat.size] = feat
    idx = 0
    deg = np.ediff1d(adj_gs.indptr)
    num_v = deg.size
    # transpose adj first: e_list[n] collects the rows whose neighbor lists contain n
    e_list = [[] for v in range(adj_gs.shape[0])]
    for v in range(adj_gs.shape[0]):
        n_list = adj_gs.indices[adj_gs.indptr[v]:adj_gs.indptr[v+1]]
        for n in n_list:
            e_list[n].append(v)
    e_list_full = []
    gs_t_indptr = np.zeros(adj_gs.shape[0]+1).astype(np.int32)    # indptr for adj_gs.T
    for i, el in enumerate(e_list):
        e_list_full.extend(sorted(el))
        gs_t_indptr[i+1] = gs_t_indptr[i] + len(el)
    gs_t_indices = np.array(e_list_full).astype(np.int32)         # indices for adj_gs.T
    # prepare I_edges here, after identifying the large-weight edges:
    # I_edges[(u, w)] lists the rows whose neighbor lists contain both u and w
    I_edges = dict()
    for (aggr1, aggr2) in M:
        # intersection of aggr1's and aggr2's in-neighbor lists (rows of the transpose)
        _neigh1 = gs_t_indices[gs_t_indptr[aggr1]:gs_t_indptr[aggr1+1]]
        _neigh2 = gs_t_indices[gs_t_indptr[aggr2]:gs_t_indptr[aggr2+1]]
        I_edges[(aggr1, aggr2)] = np.intersect1d(_neigh1, _neigh2, assume_unique=True)
    for (aggr1, aggr2) in M:
        v_root = I_edges[(aggr1, aggr2)]
        ret_feat[num_v+idx] = ret_feat[aggr1] + ret_feat[aggr2]
        for v in v_root:
            neigh = adj_gs.indices[adj_gs.indptr[v]:adj_gs.indptr[v+1]]
            i1 = np.where(neigh == aggr1)[0][0]
            i2 = np.where(neigh == aggr2)[0][0]   # searchsorted not applicable here since we insert -1 below
            adj_gs.indices[adj_gs.indptr[v]+i1] = num_v + idx   # point at the merged node
            adj_gs.indices[adj_gs.indptr[v]+i2] = -1            # mark for removal
            deg[v] -= 1
        idx += 1
    # rebuild a clean CSR matrix without the -1 placeholders; the len(M) newly
    # appended merged nodes have empty neighbor lists of their own
    _indptr_new = np.cumsum(deg)
    indptr_new = np.zeros(num_v+idx+1).astype(np.int64)
    indptr_new[1:num_v+1] = _indptr_new
    indptr_new[num_v+1:] = _indptr_new[-1]
    indices_new = adj_gs.indices[np.where(adj_gs.indices > -1)]
    assert indices_new.size == indptr_new[-1]
    data_new = np.ones(indices_new.size)
    ret_adj = sp.csr_matrix((data_new, indices_new, indptr_new),
                            shape=(num_v+len(M), num_v+len(M)))
    return ret_adj, ret_feat


# operation / traffic statistics on a CSR adjacency matrix
f_tot_ops = lambda adj: adj.size - np.where(np.ediff1d(adj.indptr) > 0)[0].size   # additions: a row of degree d costs d - 1
f_tot_read = lambda adj: adj.size   # feature reads: one per nonzero
max_deg = lambda adj: np.ediff1d(adj.indptr).max()
mean_deg = lambda adj: np.ediff1d(adj.indptr).mean()
sigma_deg2 = lambda adj: (np.ediff1d(adj.indptr)**2).sum()/adj.shape[0]


def main(adj, num_round):
    adj_gs = sp.load_npz(adj)
    num_v_orig = adj_gs.shape[0]
    tot_ops_orig = f_tot_ops(adj_gs)
    tot_read_orig = f_tot_read(adj_gs)
    # random scalar features, so that correctness of the reduction can be
    # verified by comparing aggregation results before and after
    feat = np.random.rand(adj_gs.shape[0])
    ground_truth = adj_gs @ feat.reshape(-1, 1)
    cnt_precompute = 0
    cnt_preread = 0
    for r in range(num_round):
        print("max deg: {}, avg deg: {:.2f}, (\\Sigma deg^2)/|V|: {}".format(
            max_deg(adj_gs), mean_deg(adj_gs), sigma_deg2(adj_gs)))
        ops_prev = f_tot_ops(adj_gs)
        weight_edges, num_v = construct_ga(adj_gs)
        M, _W = obtain_precompute_edges(weight_edges, num_v)
        cnt_precompute += len(M)    # one addition per precomputed pair
        cnt_preread += 2*len(M)     # two feature reads per precomputed pair
        adj_gs, feat = obtain_compact_mat(adj_gs, M, feat)
        ops_new = f_tot_ops(adj_gs) + cnt_precompute
        read_new = f_tot_read(adj_gs) + cnt_preread
        print("previous ops: ", ops_prev)
        print("new ops: ", ops_new)
        print("match size: ", len(M))
        print("reduction comp compared to original: {:.2f} (precompute {:.3f} of original total ops, temp buffer {:.3f}% of |V|)"
              .format(tot_ops_orig/ops_new, cnt_precompute/tot_ops_orig, cnt_precompute/num_v_orig*100))
        print("reduction comm compared to original: {:.2f}".format(tot_read_orig/read_new))
    # the first num_v_orig entries of the aggregation on the reduced graph
    # must match the ground truth computed on the original graph
    optimized_result = adj_gs @ feat.reshape(-1, 1)
    np.testing.assert_allclose(ground_truth, optimized_result[:ground_truth.size], rtol=1e-8, atol=0)
    print("RESULT CORRECT!")


if __name__ == '__main__':
    args = parse_args()
    main(args.adj, args.round)
--------------------------------------------------------------------------------