├── data_demo └── pubmed │ ├── feats.npy │ └── labels.npz ├── Train ├── requirements.txt ├── run.sh ├── config │ ├── ppi.json │ ├── mag.json │ ├── paper.json │ ├── yelp.json │ ├── amazon2m.json │ ├── pubmed.json │ └── reddit.json ├── model.py ├── logger.py ├── run_node.py ├── loader.py └── data_processor.py ├── Precompute ├── script │ ├── run_mag.sh │ ├── run_paper.sh │ ├── run_pubmed.sh │ ├── run_reddit.sh │ ├── run_ppi.sh │ ├── run_yelp.sh │ └── run_amazon2m.sh ├── CMakeLists.txt ├── main.cpp ├── HelperFunctions.h ├── BasicDefinition.h ├── HelperFunctions.cpp ├── BatchRandomWalk.h ├── FeatureOp.h ├── MyType.h ├── FeatureDecomp.h ├── fastPRNG.h ├── npy.hpp ├── FeatureOp.cpp └── Graph.h ├── .gitignore ├── README.md ├── LICENSE.rtf └── demo.ipynb /data_demo/pubmed/feats.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gdmnl/SCARA-PPR/HEAD/data_demo/pubmed/feats.npy -------------------------------------------------------------------------------- /data_demo/pubmed/labels.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gdmnl/SCARA-PPR/HEAD/data_demo/pubmed/labels.npz -------------------------------------------------------------------------------- /Train/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | dotmap=1.3.26 4 | numpy=1.22 5 | pytorch=1.10.0 6 | scikit-learn=1.0.1 7 | scipy=1.7.3 8 | tqdm=4.62.3 9 | yaml=0.2.5 10 | zlib=1.2.11 11 | -------------------------------------------------------------------------------- /Train/run.sh: -------------------------------------------------------------------------------- 1 | DATASTR=amazon2m 2 | ALGOSTR=featpush 3 | for SEED in 0 1 2 4 | do 5 | OUTDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | OUTFILE=${OUTDIR}/out_${SEED}.txt 7 | python -u 
run_node.py --seed ${SEED} --config ./config/${DATASTR}.json --dev ${1:--1} > ${OUTFILE} & 8 | echo $! && wait 9 | done 10 | -------------------------------------------------------------------------------- /Train/config/ppi.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "ppi", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 1000, 6 | "patience": 100, 7 | "batch": 2048, 8 | "lr": 0.005, 9 | "weight_decay": 0, 10 | "layer": 4, 11 | "hidden": 2048, 12 | "dropout": 0.1, 13 | "bias": "bn", 14 | "alpha": 0.3, 15 | "eps": 0.5, 16 | "rrz": 0.0, 17 | "inductive": true, 18 | "multil": true, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/mag.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "mag", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 500, 6 | "patience": 100, 7 | "batch": 512, 8 | "lr": 0.001, 9 | "weight_decay": 1e-05, 10 | "layer": 4, 11 | "hidden": 2048, 12 | "dropout": 0.3, 13 | "bias": "none", 14 | "alpha": 0.5, 15 | "eps": 16, 16 | "rrz": 0.5, 17 | "inductive": false, 18 | "multil": true, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/paper.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "paper", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 500, 6 | "patience": 50, 7 | "batch": 512, 8 | "lr": 0.001, 9 | "weight_decay": 0, 10 | "layer": 4, 11 | "hidden": 128, 12 | "dropout": 0.1, 13 | "bias": "none", 14 | "alpha": 0.5, 15 | "eps": 64, 16 | "rrz": 0.5, 17 | "inductive": false, 18 | "multil": false, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/yelp.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"data": "yelp", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 1000, 6 | "patience": 100, 7 | "batch": 2048, 8 | "lr": 0.005, 9 | "weight_decay": 0, 10 | "layer": 4, 11 | "hidden": 2048, 12 | "dropout": 0.1, 13 | "bias": "biasbn", 14 | "alpha": 0.9, 15 | "eps": 16, 16 | "rrz": 0.3, 17 | "inductive": true, 18 | "multil": true, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/amazon2m.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "amazon2m", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 1000, 6 | "patience": 100, 7 | "batch": 2048, 8 | "lr": 0.005, 9 | "weight_decay": 0, 10 | "layer": 4, 11 | "hidden": 2048, 12 | "dropout": 0.1, 13 | "bias": "bn", 14 | "alpha": 0.2, 15 | "eps": 4, 16 | "rrz": 0.2, 17 | "inductive": true, 18 | "multil": false, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/pubmed.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "pubmed", 3 | "path": "../data/", 4 | "algo": "featreuse", 5 | "epochs": 1000, 6 | "patience": 100, 7 | "batch": 64, 8 | "lr": 0.005, 9 | "weight_decay": 1e-4, 10 | "layer": 2, 11 | "hidden": 128, 12 | "dropout": 0.5, 13 | "bias": "none", 14 | "alpha": 0.1, 15 | "eps": 2, 16 | "rrz": 0.5, 17 | "inductive": false, 18 | "multil": false, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Train/config/reddit.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "reddit", 3 | "path": "../data/", 4 | "algo": "featpush", 5 | "epochs": 1000, 6 | "patience": 100, 7 | "batch": 64, 8 | "lr": 0.005, 9 | "weight_decay": 1e-4, 10 | "layer": 4, 11 | "hidden": 128, 12 | "dropout": 0.5, 13 | "bias": "none", 14 | "alpha": 0.5, 15 | "eps": 64, 16 | "rrz": 0.5, 17 | 
"inductive": false, 18 | "multil": false, 19 | "spt": 1 20 | } -------------------------------------------------------------------------------- /Precompute/script/run_mag.sh: -------------------------------------------------------------------------------- 1 | DATASTR=mag 2 | ALGOSTR=featpush 3 | SEED=0 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.5 -epsilon 16 -thread_num 32 \ 11 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 12 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/mag -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.2 -epsilon 16 13 | -------------------------------------------------------------------------------- /Precompute/script/run_paper.sh: -------------------------------------------------------------------------------- 1 | DATASTR=paper 2 | ALGOSTR=featpush 3 | SEED=2 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.5 -epsilon 64 -thread_num 32 \ 11 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 12 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/paper -feats feats_normt.npy -thread_num 14 -seed 7 -alpha 0.5 -epsilon 64 13 | -------------------------------------------------------------------------------- /Precompute/script/run_pubmed.sh: -------------------------------------------------------------------------------- 1 | DATASTR=pubmed 2 | ALGOSTR=featpush 3 | SEED=7 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} 
-estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.1 -epsilon 2 -thread_num 1 \ 11 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 12 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/pubmed -graph adj.txt -feats feats_normt.npy -thread_num 1 -seed 0 -alpha 0.1 -epsilon 2 13 | # ../Precompute/build/featpush -algo featpca -data_folder ../data/pubmed -graph adj.txt -feats feats_normt.npy -thread_num 1 -seed 0 -alpha 0.1 -epsilon 2 14 | -------------------------------------------------------------------------------- /Precompute/script/run_reddit.sh: -------------------------------------------------------------------------------- 1 | DATASTR=reddit 2 | ALGOSTR=featpush 3 | DATADIR=../data/${DATASTR} 4 | for SEED in 0 1 2 5 | do 6 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 7 | mkdir -p ${SAVEDIR} 8 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 9 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 10 | -graph adj.txt -feats feats_normt.npy \ 11 | -alpha 0.5 -epsilon 64 -thread_num 32 \ 12 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 13 | done 14 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/reddit -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.5 -epsilon 64 15 | # ../Precompute/build/featpush -algo featpca -data_folder ../data/reddit -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.5 -epsilon 64 16 | -------------------------------------------------------------------------------- /Precompute/script/run_ppi.sh: -------------------------------------------------------------------------------- 1 | DATASTR=ppi 2 | ALGOSTR=featpush 3 | SEED=2 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.3 -epsilon 0.5 -thread_num 32 \ 11 | -seed ${SEED} > 
${SAVEDIR}/pre_${SEED}.txt 12 | DATADIR=../data/${DATASTR}_train 13 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}_train/${SEED} 14 | mkdir -p ${SAVEDIR} 15 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 16 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 17 | -graph adj.txt -feats feats_normt.npy \ 18 | -alpha 0.3 -epsilon 0.5 -thread_num 32 \ 19 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 20 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/ppi -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.3 -epsilon 0.5 21 | # ../Precompute/build/featpush -algo featpca -data_folder ../data/ppi -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.3 -epsilon 0.5 22 | -------------------------------------------------------------------------------- /Precompute/script/run_yelp.sh: -------------------------------------------------------------------------------- 1 | DATASTR=yelp 2 | ALGOSTR=featpush 3 | SEED=0 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.9 -epsilon 4 -thread_num 32 \ 11 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 12 | DATADIR=../data/${DATASTR}_train 13 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}_train/${SEED} 14 | mkdir -p ${SAVEDIR} 15 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 16 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 17 | -graph adj.txt -feats feats_normt.npy \ 18 | -alpha 0.9 -epsilon 4 -thread_num 32 \ 19 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 20 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/yelp -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.9 -epsilon 16 21 | # ../Precompute/build/featpush -algo featpca -data_folder ../data/yelp -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.9 -epsilon 16 22 | 
-------------------------------------------------------------------------------- /Precompute/script/run_amazon2m.sh: -------------------------------------------------------------------------------- 1 | DATASTR=amazon2m 2 | ALGOSTR=featpush 3 | SEED=2 4 | DATADIR=../data/${DATASTR} 5 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED} 6 | mkdir -p ${SAVEDIR} 7 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 8 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 9 | -graph adj.txt -feats feats_normt.npy \ 10 | -alpha 0.2 -epsilon 4 -thread_num 32 \ 11 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 12 | DATADIR=../data/${DATASTR}_train 13 | SAVEDIR=../save/${DATASTR}/${ALGOSTR}_train/${SEED} 14 | mkdir -p ${SAVEDIR} 15 | ../Precompute/build/featpush -algo ${ALGOSTR} \ 16 | -data_folder ${DATADIR} -estimation_folder ${SAVEDIR} \ 17 | -graph adj.txt -feats feats_normt.npy \ 18 | -alpha 0.2 -epsilon 4 -thread_num 32 \ 19 | -seed ${SEED} > ${SAVEDIR}/pre_${SEED}.txt 20 | # ../Precompute/build/featpush -algo featpush -data_folder ../data/amazon2m -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.2 -epsilon 4 21 | # ../Precompute/build/featpush -algo featpca -data_folder ../data/amazon2m -feats feats_normt.npy -thread_num 1 -seed 7 -alpha 0.2 -epsilon 4 22 | -------------------------------------------------------------------------------- /Precompute/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.16) 2 | project(featpush) 3 | 4 | set(CMAKE_CXX_STANDARD 14) 5 | set(CMAKE_OSX_ARCHITECTURES "x86_64") 6 | set(CMAKE_CXX_FLAGS "-march=native -msse2 -msse -DHAVE_SSE2 -ffast-math -mfpmath=sse -pthread -std=c++14 -O3 -DNDEBUG") 7 | #set(CMAKE_CXX_FLAGS "-g -march=native -msse2 -msse -DHAVE_SSE2 -ffast-math -mfpmath=sse -pthread -std=c++14 -O3") 8 | #set(CMAKE_CXX_FLAGS "-march=native -ffast-math -use_fast_math -pthread -std=c++11 -O3 -DNDEBUG") 9 | #set(CMAKE_CXX_FLAGS "-march=native 
-ffast-math -use_fast_math -pthread -std=c++11") 10 | 11 | #include_directories( "/usr/local/include/eigen3" ) 12 | find_package(Eigen3 3.4 REQUIRED NO_MODULE) 13 | 14 | #add_executable(featpush main.cpp 15 | # npy.hpp FeatureDecomp.h fastPRNG.h 16 | # BasicDefinition.h Graph.h BatchRandomWalk.h 17 | # HelperFunctions.h HelperFunctions.cpp MyType.h 18 | # FeatureOp.cpp FeatureOp.h SpeedPPR.h) 19 | add_executable(featpush main.cpp 20 | npy.hpp FeatureDecomp.h 21 | BasicDefinition.h Graph.h 22 | HelperFunctions.h HelperFunctions.cpp MyType.h 23 | FeatureOp.cpp FeatureOp.h SpeedPPR.h) 24 | target_link_libraries(featpush Eigen3::Eigen) 25 | -------------------------------------------------------------------------------- /Precompute/main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "HelperFunctions.h" 3 | #include "Graph.h" 4 | #include "SpeedPPR.h" 5 | #include "FeatureOp.cpp" 6 | 7 | #ifdef ENABLE_RW 8 | #include "BatchRandomWalk.h" 9 | 10 | XoshiroGenerator fRNG; 11 | 12 | XoshiroGenerator init_rng(uint64_t seed) { 13 | XoshiroGenerator rng; 14 | rng.initialize(seed); 15 | return rng; 16 | } 17 | #endif 18 | 19 | int main(int argc, char **argv) { 20 | param = parseArgs(argc, argv); 21 | std::srand(param.seed); 22 | #ifdef ENABLE_RW 23 | fRNG = init_rng(param.seed); 24 | #endif 25 | // Input graph 26 | Graph graph; 27 | graph.set_alpha(param.alpha); 28 | std::ifstream bin_file(param.data_folder + "/graph.bin"); 29 | if (!bin_file.good()) { 30 | CleanGraph cleaner; 31 | cleaner.clean_graph(param.graph_file, param.data_folder); 32 | } 33 | graph.read_binary(param.data_folder + "/attribute.txt", param.data_folder + "/graph.bin"); 34 | 35 | // Perform feature operations 36 | if (param.algorithm == "featpush"){ 37 | FeatProc proc(graph, param); 38 | proc.push(); 39 | proc.show_statistics(); 40 | } else if (param.algorithm == "featreuse") { 41 | FeatProc_greedy proc(graph, param); 42 | proc.fit(); 43 | 
proc.push(); 44 | proc.show_statistics(); 45 | } else if (param.algorithm == "featpca") { 46 | FeatProc_pca proc(graph, param); 47 | proc.fit(); 48 | proc.push(); 49 | proc.show_statistics(); 50 | } 51 | printf("%s\n", std::string(80, '-').c_str()); 52 | return 0; 53 | } 54 | -------------------------------------------------------------------------------- /Precompute/HelperFunctions.h: -------------------------------------------------------------------------------- 1 | /* 2 | Interface and IO 3 | Author: nyLiao 4 | */ 5 | #ifndef SCARA_HELPERFUNCTIONS_H 6 | #define SCARA_HELPERFUNCTIONS_H 7 | 8 | #define MSG(...) { cout << #__VA_ARGS__ << ": " << (__VA_ARGS__) << endl; } 9 | 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include "BasicDefinition.h" 25 | #ifdef __linux__ 26 | #include 27 | #endif 28 | 29 | 30 | // ==================== Runtime measurement 31 | extern double getCurrentTime(); 32 | 33 | inline float get_proc_memory(){ 34 | struct rusage r_usage; 35 | getrusage(RUSAGE_SELF,&r_usage); 36 | return r_usage.ru_maxrss/1000000.0; 37 | } 38 | 39 | inline float get_alloc_memory(){ 40 | struct mallinfo mi = mallinfo(); 41 | return mi.uordblks / 1000000000.0; 42 | } 43 | 44 | inline float get_stat_memory(){ 45 | long rss; 46 | std::string ignore; 47 | std::ifstream ifs("/proc/self/stat", std::ios_base::in); 48 | ifs >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore 49 | >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore >> ignore 50 | >> ignore >> ignore >> ignore >> rss; 51 | 52 | long page_size_kb = sysconf(_SC_PAGE_SIZE) / 1024; 53 | return rss * page_size_kb / 1000000.0; 54 | } 55 | 56 | // ==================== Argument parsing 57 | struct Param { 58 | std::string graph_file; 59 | std::string query_file; 60 | std::string 
feature_file; 61 | std::string algorithm = "featpush"; 62 | std::string data_folder; 63 | std::string estimation_folder; 64 | unsigned int thread_num = 1; 65 | unsigned int seed = 0; 66 | float epsilon = 0.5; 67 | float alpha = 0.2; 68 | float gamma = 0.2; 69 | float base_ratio = 0.04; 70 | bool index = false; 71 | bool output_estimations = false; 72 | }; 73 | 74 | extern Param param; 75 | 76 | extern Param parseArgs(int nargs, char **args); 77 | 78 | // ==================== IO 79 | inline size_t load_query(IntVector &Vt_nodes, std::string query_path, const NInt &V_num){ 80 | // By default use all nodes 81 | if (query_path.empty()) { 82 | Vt_nodes.resize(V_num); 83 | std::iota(Vt_nodes.begin(), Vt_nodes.end(), 0); 84 | } else { 85 | std::ifstream query_file(query_path); 86 | if (query_file.good() == false) { 87 | printf("File Not Exists.\n"); 88 | exit(1); 89 | } 90 | for (NInt sid; (query_file >> sid);) { 91 | Vt_nodes.emplace_back(sid); 92 | } 93 | if (Vt_nodes.empty()) { 94 | printf("Error! 
Empty File\n"); 95 | } 96 | query_file.close(); 97 | } 98 | 99 | cout << "Query size: " << Vt_nodes.size() << endl; 100 | return Vt_nodes.size(); 101 | } 102 | 103 | #endif //SCARA_HELPERFUNCTIONS_H 104 | -------------------------------------------------------------------------------- /Precompute/BasicDefinition.h: -------------------------------------------------------------------------------- 1 | /* 2 | Type and class definitions 3 | Author: nyLiao 4 | */ 5 | #ifndef SCARA_BASICDEFINITION_H 6 | #define SCARA_BASICDEFINITION_H 7 | 8 | // #ifndef ENABLE_RW 9 | // #define ENABLE_RW 10 | // #endif 11 | // #ifndef ENABLE_INITTH 12 | // #define ENABLE_INITTH 13 | // #endif 14 | // #ifndef ENABLE_PI 15 | // #define ENABLE_PI 16 | // #endif 17 | // #ifndef DEBUG 18 | // #define DEBUG 19 | // #endif 20 | 21 | #include 22 | #include 23 | #include 24 | 25 | using std::cout; 26 | using std::endl; 27 | 28 | #define IDFMT "lu" // NInt print format 29 | typedef unsigned long NInt; // Type of Node / Edge size 30 | typedef float ScoreFlt; // Type of PPR Score 31 | typedef std::vector IntVector; 32 | typedef std::vector FltVector; 33 | 34 | template 35 | struct IdScorePair { 36 | NInt id = 0; 37 | FLT score = 0; 38 | 39 | IdScorePair(const NInt &_id = 0, const FLT &_score = 0) : 40 | id(_id), score(_score) {} 41 | }; 42 | 43 | template 44 | struct IdScorePairComparatorGreater { 45 | // Compare 2 IdScorePair objects using name 46 | bool operator()(const IdScorePair &pair1, const IdScorePair &pair2) { 47 | return pair1.score > pair2.score || pair1.score == pair2.score && pair1.id < pair2.id; 48 | } 49 | }; 50 | 51 | template 52 | struct IdScorePairComparatorLess { 53 | // Compare 2 IdScorePair objects using name 54 | bool operator()(const IdScorePair &pair1, const IdScorePair &pair2) { 55 | return pair1.score < pair2.score || pair1.score == pair2.score && pair1.id < pair2.id; 56 | } 57 | }; 58 | 59 | struct Edge { 60 | NInt from_id; 61 | NInt to_id; 62 | 63 | Edge() : from_id(0), 
to_id(0) {} 64 | 65 | Edge(const NInt &_from, const NInt &_to) : 66 | from_id(_from), to_id(_to) {} 67 | 68 | bool operator<(const Edge &_edge) const { 69 | return from_id < _edge.from_id || (from_id == _edge.from_id && to_id < _edge.to_id); 70 | } 71 | }; 72 | 73 | class VertexQueue { 74 | private: 75 | const NInt mask; 76 | IntVector queue; 77 | NInt num = 0; 78 | NInt idx_front = 0; 79 | NInt idx_last_plus_one = 0; 80 | private: 81 | static inline NInt compute_queue_size(const NInt &_numOfVertices) { 82 | return (1u) << (uint32_t) ceil(log2(_numOfVertices + 2u)); 83 | } 84 | 85 | public: 86 | explicit VertexQueue(const NInt &_numOfVertices) : 87 | mask(compute_queue_size(_numOfVertices) - 1), 88 | queue(mask + 2u, 0) {} 89 | 90 | inline void clear() { 91 | idx_front = 0; 92 | idx_last_plus_one = 0; 93 | num = 0; 94 | } 95 | 96 | inline const NInt &size() const { return num; } 97 | 98 | inline const NInt &front() const { return queue[idx_front]; } 99 | 100 | inline void pop() { 101 | --num; 102 | ++idx_front; 103 | idx_front &= mask; 104 | } 105 | 106 | inline void push(const NInt &_elem) { 107 | ++num; 108 | queue[idx_last_plus_one] = _elem; 109 | ++idx_last_plus_one; 110 | idx_last_plus_one &= mask; 111 | } 112 | 113 | inline bool empty() const { 114 | return idx_last_plus_one == idx_front; 115 | } 116 | }; 117 | 118 | #endif //SCARA_BASICDEFINITION_H 119 | -------------------------------------------------------------------------------- /Train/model.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | 6 | 7 | class ResLinear(nn.Module): 8 | def __init__(self, in_features, out_features, ftransform='none'): 9 | super(ResLinear, self).__init__() 10 | self.in_features = in_features 11 | self.out_features = out_features 12 | self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features)) 13 | self.bias = None 14 | if ftransform 
== 'bn': 15 | self.trans_fn = nn.BatchNorm1d(out_features) 16 | elif ftransform == 'bias': 17 | self.bias = nn.Parameter(torch.FloatTensor(out_features)) 18 | self.trans_fn = lambda x: x + self.bias 19 | elif ftransform == 'biasbn': 20 | self.bias = nn.Parameter(torch.FloatTensor(out_features)) 21 | self.bn = nn.BatchNorm1d(out_features) 22 | self.trans_fn = lambda x: self.bn(x + self.bias) 23 | else: 24 | self.trans_fn = lambda x: x 25 | 26 | self.reset_parameters() 27 | 28 | def reset_parameters(self): 29 | stdv = 1. / math.sqrt(self.weight.size(1)) 30 | self.weight.data.uniform_(-stdv, stdv) 31 | if self.bias is not None: 32 | self.bias.data.zero_() 33 | 34 | def forward(self, input): 35 | output = torch.mm(input, self.weight) 36 | output = self.trans_fn(output) 37 | # Residual connection 38 | if self.in_features == self.out_features: 39 | output += input 40 | return output 41 | 42 | 43 | class Dense(nn.Module): 44 | def __init__(self, nfeat, nlayers, nhidden, nclass, dropout, bias): 45 | super(Dense, self).__init__() 46 | self.fcs = nn.ModuleList() 47 | self.fcs.append(ResLinear(nfeat, nhidden, bias)) 48 | for _ in range(nlayers-2): 49 | self.fcs.append(ResLinear(nhidden, nhidden, bias)) 50 | self.fcs.append(ResLinear(nhidden, nclass)) 51 | self.act_fn = nn.ReLU() 52 | self.dropout = dropout 53 | 54 | def forward(self, x): 55 | x = F.dropout(x, self.dropout, training=self.training) 56 | x = self.act_fn(self.fcs[0](x)) 57 | for fc in self.fcs[1:-1]: 58 | x = F.dropout(x, self.dropout, training=self.training) 59 | x = self.act_fn(fc(x)) 60 | x = F.dropout(x, self.dropout, training=self.training) 61 | x = self.fcs[-1](x) 62 | return x 63 | 64 | 65 | class DenseSkip(nn.Module): 66 | def __init__(self, nfeat, nlayers, nhidden, nclass, dropout, bias): 67 | super(DenseSkip, self).__init__() 68 | self.fcs = nn.ModuleList() 69 | self.fcs.append(ResLinear(nfeat, nhidden, bias)) 70 | for _ in range(nlayers-2): 71 | self.fcs.append(ResLinear(nhidden, nhidden, bias)) 72 | 
self.fcs.append(ResLinear(nhidden, nclass)) 73 | self.act_fn = nn.ReLU() 74 | self.dropout = dropout 75 | 76 | def forward(self, x): 77 | out = F.dropout(x, self.dropout, training=self.training) 78 | # Shortcut connection of input features 79 | out1 = self.fcs[0](out) 80 | out = self.act_fn(out1) 81 | for fc in self.fcs[1:-1]: 82 | out = F.dropout(out, self.dropout, training=self.training) 83 | out = fc(out) 84 | out += out1 85 | out = self.act_fn(out) 86 | out = F.dropout(out, self.dropout, training=self.training) 87 | out = self.fcs[-1](out) 88 | return out 89 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # This repo 2 | wheels/* 3 | save 4 | save/* 5 | backup/* 6 | run/* 7 | data 8 | data/* 9 | */result/ 10 | !**/Makefile 11 | 12 | # my ignore 13 | try* 14 | .vscode/ 15 | .DS_Store 16 | 17 | # CMake 18 | CMakeLists.txt.user 19 | CMakeCache.txt 20 | CMakeFiles 21 | CMakeScripts 22 | Testing 23 | Makefile 24 | cmake_install.cmake 25 | install_manifest.txt 26 | compile_commands.json 27 | CTestTestfile.cmake 28 | _deps 29 | 30 | # Prerequisites 31 | *.d 32 | 33 | # Compiled Object files 34 | *.slo 35 | *.lo 36 | *.o 37 | *.obj 38 | 39 | # Precompiled Headers 40 | *.gch 41 | *.pch 42 | 43 | # Compiled Dynamic libraries 44 | *.so 45 | *.dylib 46 | *.dll 47 | 48 | # Fortran module files 49 | *.mod 50 | *.smod 51 | 52 | # Compiled Static libraries 53 | *.lai 54 | *.la 55 | *.a 56 | *.lib 57 | 58 | # Executables 59 | *.exe 60 | *.out 61 | *.app 62 | 63 | # Byte-compiled / optimized / DLL files 64 | __pycache__/ 65 | *.py[cod] 66 | *$py.class 67 | 68 | # C extensions 69 | *.so 70 | 71 | # Distribution / packaging 72 | .Python 73 | build/ 74 | develop-eggs/ 75 | dist/ 76 | downloads/ 77 | eggs/ 78 | .eggs/ 79 | lib/ 80 | lib64/ 81 | parts/ 82 | sdist/ 83 | var/ 84 | wheels/ 85 | pip-wheel-metadata/ 86 | share/python-wheels/ 87 | *.egg-info/ 
88 | .installed.cfg 89 | *.egg 90 | MANIFEST 91 | 92 | # PyInstaller 93 | # Usually these files are written by a python script from a template 94 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 95 | *.manifest 96 | *.spec 97 | 98 | # Installer logs 99 | pip-log.txt 100 | pip-delete-this-directory.txt 101 | 102 | # Unit test / coverage reports 103 | htmlcov/ 104 | .tox/ 105 | .nox/ 106 | .coverage 107 | .coverage.* 108 | .cache 109 | nosetests.xml 110 | coverage.xml 111 | *.cover 112 | *.py,cover 113 | .hypothesis/ 114 | .pytest_cache/ 115 | cover/ 116 | 117 | # Translations 118 | *.mo 119 | *.pot 120 | 121 | # Django stuff: 122 | *.log 123 | local_settings.py 124 | db.sqlite3 125 | db.sqlite3-journal 126 | 127 | # Flask stuff: 128 | instance/ 129 | .webassets-cache 130 | 131 | # Scrapy stuff: 132 | .scrapy 133 | 134 | # Sphinx documentation 135 | docs/_build/ 136 | 137 | # PyBuilder 138 | .pybuilder/ 139 | target/ 140 | 141 | # Jupyter Notebook 142 | .ipynb_checkpoints 143 | 144 | # IPython 145 | profile_default/ 146 | ipython_config.py 147 | 148 | # pyenv 149 | # For a library or package, you might want to ignore these files since the code is 150 | # intended to run in multiple environments; otherwise, check them in: 151 | # .python-version 152 | 153 | # pipenv 154 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 155 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 156 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 157 | # install all needed dependencies. 158 | #Pipfile.lock 159 | 160 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 161 | __pypackages__/ 162 | 163 | # Celery stuff 164 | celerybeat-schedule 165 | celerybeat.pid 166 | 167 | # SageMath parsed files 168 | *.sage.py 169 | 170 | # Environments 171 | .env 172 | .venv 173 | env/ 174 | venv/ 175 | ENV/ 176 | env.bak/ 177 | venv.bak/ 178 | 179 | # Spyder project settings 180 | .spyderproject 181 | .spyproject 182 | 183 | # Rope project settings 184 | .ropeproject 185 | 186 | # mkdocs documentation 187 | /site 188 | 189 | # mypy 190 | .mypy_cache/ 191 | .dmypy.json 192 | dmypy.json 193 | 194 | # Pyre type checker 195 | .pyre/ 196 | 197 | # pytype static type analyzer 198 | .pytype/ 199 | 200 | # Cython debug symbols 201 | cython_debug/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SCARA-PPR 2 | This is the original code for *"SCARA: Scalable Graph Neural Networks with Feature-Oriented Optimization"* (VLDB 2022) and *"Scalable Decoupling Graph Neural Networks with Feature-Oriented Optimization"* (VLDBJ 2023). 3 | 4 | [Paper - VLDB](https://www.vldb.org/pvldb/vol15/p3240-liao.pdf) | 5 | [Paper - VLDBJ](https://link.springer.com/article/10.1007/s00778-023-00829-6) | 6 | [GitHub](https://github.com/gdmnl/SCARA-PPR) | 7 | [Tech Report](https://sites.google.com/view/scara-techreport) | 8 | [arXiv](https://arxiv.org/abs/2207.09179) 9 | 10 | ## Citation 11 | 12 | If you find this work useful, please cite our papers: 13 | ### VLDBJ: 14 | > Ningyi Liao, Dingheng Mo, Siqiang Luo, Xiang Li, and Pengcheng Yin. 15 | > Scalable Decoupling Graph Neural Networks with Feature-Oriented Optimization. 16 | > The VLDB Journal, 33, 2023. doi:10.1007/s00778-023-00829-6. 
17 | ``` 18 | @article{liao2023scalable, 19 | title={Scalable Decoupling Graph Neural Networks with Feature-Oriented Optimization}, 20 | author={Liao, Ningyi and Mo, Dingheng and Luo, Siqiang and Li, Xiang and Yin, Pengcheng}, 21 | journal={The {VLDB} Journal}, 22 | volume={33}, 23 | year={2023}, 24 | publisher={Springer}, 25 | url={https://link.springer.com/article/10.1007/s00778-023-00829-6}, 26 | doi={10.1007/s00778-023-00829-6} 27 | } 28 | ``` 29 | 30 | ### VLDB: 31 | > Ningyi Liao, Dingheng Mo, Siqiang Luo, Xiang Li, and Pengcheng Yin. 32 | > SCARA: Scalable Graph Neural Networks with Feature-Oriented Optimization. 33 | > PVLDB, 15(11): 3240-3248, 2022. doi:10.14778/3551793.3551866. 34 | ``` 35 | @article{liao2022scara, 36 | title={{SCARA}: Scalable Graph Neural Networks with Feature-Oriented Optimization}, 37 | author={Liao, Ningyi and Mo, Dingheng and Luo, Siqiang and Li, Xiang and Yin, Pengcheng}, 38 | journal={Proceedings of the VLDB Endowment}, 39 | volume={15}, 40 | number={11}, 41 | pages={3240-3248}, 42 | year={2022}, 43 | publisher={VLDB Endowment}, 44 | url = {https://doi.org/10.14778/3551793.3551866}, 45 | } 46 | ``` 47 | 48 | ## Usage 49 | **We provide a complete example and its log in the [demo notebook](demo.ipynb). The sample PubMed dataset is available in the [data folder](data_demo/pubmed/).** 50 | 51 | ### Data Preparation 52 | 1. Download data (links [below](#dataset-link)) in GBP format to path `data/[dataset_name]`. Similar to the PubMed dataset example, there are three files: 53 | * `adj.txt`: adjacency table 54 | * First line: "`# [number of nodes]`" 55 | * `feats.npy`: features in .npy array 56 | * `labels.npz`: node label information 57 | * 'label': labels (number or one-hot) 58 | * 'idx_train/idx_val/idx_test': indices of training/validation/test nodes (inductive task) 59 | 2. 
Run command `python data_processor.py` to generate additional processed files: 60 | * `degrees.npz`: node degrees in .npz 'arr_0' 61 | * `feats_norm.npy`: normalized features in .npy array 62 | * Large matrix can be split 63 | * `query.txt`: indices of queried nodes 64 | 65 | ### Precompute 66 | 1. Environment: CMake 3.16, C++ 14. Dependencies: [eigen3](https://eigen.tuxfamily.org/index.php?title=Main_Page) 67 | 2. CMake `cmake -B build`, then `make` 68 | 3. Run script: `./run_pubmed.sh` 69 | 70 | ### Train and Test 71 | 1. Install dependencies: `conda create --name [envname] --file requirements.txt` 72 | 2. Run experiment: `python run_node.py -f [seed] -c [config_file] -v [device]` 73 | 74 | ## Baseline Models 75 | * GraphSAINT: [GraphSAINT](https://github.com/GraphSAINT/GraphSAINT) 76 | * APPNP: [APPNP](https://github.com/benedekrozemberczki/APPNP) 77 | * PPRGo: [PPRGo](https://github.com/TUM-DAML/pprgo_pytorch) 78 | * GBP: [GBP](https://github.com/chennnM/GBP) 79 | * AGP: [AGP](https://github.com/wanghzccls/AGP-Approximate_Graph_Propagation) 80 | * GAS: [GAS](https://github.com/rusty1s/pyg_autoscale) 81 | 82 | ## Dataset Links 83 | * Citeseer & Pubmed: [GBP](https://github.com/chennnM/GBP) 84 | * PPI: [GraphSAGE](http://snap.stanford.edu/graphsage/) 85 | * Yelp: [GraphSAINT](https://github.com/GraphSAINT/GraphSAINT) 86 | * Reddit: [PPRGo](https://github.com/TUM-DAML/pprgo_pytorch) 87 | * Products & Papers100M: [OGB](https://github.com/snap-stanford/ogb) 88 | * Amazon: [Cluster-GCN](http://manikvarma.org/downloads/XC/XMLRepository.html) 89 | * MAG: [PANE](https://renchi.ac.cn/datasets/) 90 | -------------------------------------------------------------------------------- /Precompute/HelperFunctions.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "HelperFunctions.h" 4 | 5 | 6 | // ==================== Runtime measurement 7 | double getCurrentTime() { 8 | long long time = 
std::chrono::duration_cast( 9 | std::chrono::steady_clock::now().time_since_epoch()).count(); 10 | return static_cast(time) / 1000000.0; 11 | // return clock() / (double) CLOCKS_PER_SEC; 12 | } 13 | 14 | // ==================== Argument parsing 15 | void ltrim(std::string &_str, const std::string &_chars = "\t\n\v\f\r ") { 16 | _str.erase(0, _str.find_first_not_of(_chars)); 17 | } 18 | 19 | void rtrim(std::string &_str, const std::string &_chars = "\t\n\v\f\r ") { 20 | _str.erase(_str.find_last_not_of(_chars) + 1); 21 | } 22 | 23 | void trim(std::string &_str, const std::string &_chars = "\t\n\v\f\r ") { 24 | ltrim(_str, _chars); 25 | rtrim(_str, _chars); 26 | } 27 | 28 | /* Get the index of next unblank char from a string. */ 29 | unsigned int getNextChar(const char *str) { 30 | unsigned int rtn = 0; 31 | // Jump over all blanks 32 | for (; str[rtn] == ' '; ++rtn); 33 | return rtn; 34 | } 35 | 36 | /* Get next word from a string. */ 37 | std::string getNextWord(const char *str) { 38 | // Jump over all blanks 39 | std::string rtn(str); 40 | trim(rtn); 41 | return rtn; 42 | } 43 | 44 | Param param; 45 | 46 | Param parseArgs(int nargs, char **args) { 47 | std::vector Algorithms{ 48 | "featpush", "featreuse", "featpca", 49 | }; 50 | 51 | Param rtn; 52 | std::unordered_set algo_set(Algorithms.begin(), Algorithms.end()); 53 | 54 | printf("%s\n", std::string(80, '-').c_str()); 55 | printf("Configs:\n"); 56 | for (unsigned int cnt = 1; cnt < nargs;) { 57 | char *arg = args[cnt++]; 58 | if (cnt == nargs) { 59 | printf("Unknown Parameters.\n"); 60 | exit(0); 61 | } 62 | unsigned int i = getNextChar(arg); 63 | if (arg[i] != '-') { 64 | printf("Unknown Parameters.\n"); 65 | exit(0); 66 | } 67 | std::string para = getNextWord(arg + i + 1); 68 | // printf("-%s\n", para.c_str()); 69 | printf("\t"); 70 | arg = args[cnt++]; 71 | if (para == "algo") { 72 | rtn.algorithm = std::string(arg); 73 | cout << "Algorithm Parameter: " << rtn.algorithm << endl; 74 | if 
(algo_set.find(rtn.algorithm) == algo_set.end()) { 75 | printf("Unknown Algorithm.\n"); 76 | exit(0); 77 | } 78 | } else if (para == "data_folder") { 79 | rtn.data_folder = getNextWord(arg); 80 | printf("Data Folder: %s\n", rtn.data_folder.c_str()); 81 | } else if (para == "estimation_folder") { 82 | rtn.output_estimations = true; 83 | rtn.estimation_folder = getNextWord(arg); 84 | printf("Estimation Folder: %s\n", rtn.estimation_folder.c_str()); 85 | } else if (para == "graph") { 86 | rtn.graph_file = getNextWord(arg); 87 | if (!rtn.data_folder.empty()) 88 | rtn.graph_file = rtn.data_folder + "/" + rtn.graph_file; 89 | printf("Input Graph File: %s\n", rtn.graph_file.c_str()); 90 | } else if (para == "query") { 91 | rtn.query_file = getNextWord(arg); 92 | if (!rtn.data_folder.empty()) 93 | rtn.query_file = rtn.data_folder + "/" + rtn.query_file; 94 | printf("Input Query File: %s\n", rtn.query_file.c_str()); 95 | } else if (para == "feats") { 96 | rtn.feature_file = getNextWord(arg); 97 | if (!rtn.data_folder.empty()) 98 | rtn.feature_file = rtn.data_folder + "/" + rtn.feature_file; 99 | printf("Feature File: %s\n", rtn.feature_file.c_str()); 100 | } else if (para == "index") { 101 | auto option = getNextWord(arg); 102 | if (option == "yes") { 103 | rtn.index = true; 104 | } else if (option == "no") { 105 | rtn.index = false; 106 | } else { 107 | printf("Unknown option -%s!\n", option.c_str()); 108 | exit(0); 109 | } 110 | cout << "With Index: " << rtn.index << "\n"; 111 | } else if (para == "seed") { 112 | rtn.seed = std::stoi(getNextWord(arg)); 113 | printf("Random Seed: %d\n", rtn.seed); 114 | } else if (para == "thread_num") { 115 | rtn.thread_num = std::stoi(getNextWord(arg)); 116 | printf("Number of threads: %d\n", rtn.thread_num); 117 | } else if (para == "epsilon") { 118 | rtn.epsilon = std::stod(getNextWord(arg)); 119 | printf("Epsilon: %.4f\n", rtn.epsilon); 120 | } else if (para == "alpha") { 121 | rtn.alpha = std::stod(getNextWord(arg)); 122 | 
printf("Alpha: %.4f\n", rtn.alpha); 123 | } else if (para == "gamma") { 124 | rtn.gamma = std::stod(getNextWord(arg)); 125 | printf("Gamma: %.4f\n", rtn.gamma); 126 | } else if (para == "base_ratio") { 127 | rtn.base_ratio = std::stod(getNextWord(arg)); 128 | printf("Base ratio: %.4f\n", rtn.base_ratio); 129 | } else { 130 | printf("Unknown option -%s!\n\n", para.c_str()); 131 | exit(0); 132 | } 133 | } 134 | #ifdef ENABLE_RW 135 | printf("ON: ENABLE_RW\n"); 136 | #endif 137 | #ifdef ENABLE_PI 138 | printf("ON: ENABLE_PI\n"); 139 | #endif 140 | #ifdef ENABLE_INITTH 141 | printf("ON: ENABLE_INITTH\n"); 142 | #endif 143 | printf("%s\n", std::string(80, '-').c_str()); 144 | return rtn; 145 | } 146 | -------------------------------------------------------------------------------- /Train/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from abc import ABC 3 | from datetime import datetime 4 | import uuid 5 | import json 6 | from dotmap import DotMap 7 | import torch 8 | 9 | 10 | def prepare_opt(parser): 11 | # Parser to dict 12 | opt_parser = vars(parser.parse_args()) 13 | # Config file to dict 14 | config_path = opt_parser['config'] 15 | with open(config_path, 'r') as config_file: 16 | opt_config = json.load(config_file) 17 | # Merge dicts to dotmap 18 | return DotMap({**opt_parser, **opt_config}) 19 | 20 | 21 | class Logger(ABC): 22 | def __init__(self, data, algo, flag_run=''): 23 | super(Logger, self).__init__() 24 | 25 | # init log directory 26 | self.seed_str = str(uuid.uuid4())[:6] 27 | self.seed = int(self.seed_str, 16) 28 | if not flag_run: 29 | flag_run = datetime.now().strftime("%m%d") + '-' + self.seed_str 30 | elif flag_run.count('date') > 0: 31 | flag_run.replace('date', datetime.now().strftime("%m%d")) 32 | else: 33 | pass 34 | self.dir_save = os.path.join("../save/", data, algo, flag_run) 35 | 36 | self.path_exists = os.path.exists(self.dir_save) 37 | os.makedirs(self.dir_save, exist_ok=True) 
38 | 39 | # init log file 40 | self.flag_run = flag_run 41 | self.file_log = self.path_join('log.txt') 42 | self.file_config = self.path_join('config.json') 43 | 44 | def path_join(self, *args): 45 | """ 46 | Generate file path in current directory. 47 | """ 48 | return os.path.join(self.dir_save, *args) 49 | 50 | def print(self, s): 51 | """ 52 | Print string to console and write log file. 53 | """ 54 | print(s, flush=True) 55 | with open(self.file_log, 'a') as f: 56 | f.write(str(s) + '\n') 57 | 58 | def print_on_top(self, s): 59 | """ 60 | Print string on top of log file. 61 | """ 62 | print(s) 63 | with open(self.file_log, 'a') as f: 64 | pass 65 | with open(self.file_log, 'r+') as f: 66 | temp = f.read() 67 | f.seek(0, 0) 68 | f.write(str(s) + '\n') 69 | f.write(temp) 70 | 71 | def save_opt(self, opt): 72 | with open(self.file_config, 'a') as f: 73 | json.dump(opt.toDict(), fp=f, indent=4, sort_keys=False) 74 | f.write('\n') 75 | print("Option saved.") 76 | print("Config path: {}".format(self.file_config)) 77 | print("Option dict: {}\n".format(opt.toDict())) 78 | 79 | def load_opt(self): 80 | with open(self.file_config, 'r') as config_file: 81 | opt = DotMap(json.load(config_file)) 82 | print("Option loaded.") 83 | print("Config path: {}".format(self.file_config)) 84 | print("Option dict: {}\n".format(opt.toDict())) 85 | return opt 86 | 87 | 88 | class ModelLogger(ABC): 89 | """ 90 | Log, save, and load model, with given path, certain prefix, and changable suffix. 91 | """ 92 | def __init__(self, logger, prefix='model', state_only=False): 93 | super(ModelLogger, self).__init__() 94 | self.logger = logger 95 | self.prefix = prefix 96 | self.state_only = state_only 97 | self.model = None 98 | 99 | @property 100 | def state_dict(self): 101 | return self.model.state_dict() 102 | 103 | def __set_model(self, model): 104 | self.model = model 105 | return self.model 106 | 107 | def regi_model(self, model, save_init=True): 108 | """ 109 | Get model from parameters. 
110 | 111 | Args: 112 | model: model instance 113 | save_init (bool, optional): Whether save initial model. Defaults to True. 114 | """ 115 | self.__set_model(model) 116 | if save_init: 117 | self.save('0') 118 | 119 | def load_model(self, *suffix, model=None): 120 | """ 121 | Get model from file. 122 | """ 123 | name = '_'.join((self.prefix,) + suffix) 124 | path = self.logger.path_join(name + '.pth') 125 | 126 | if self.state_only: 127 | if model is None: 128 | model = self.model 129 | state_dict = torch.load(path, map_location='cpu') 130 | model.load_state_dict(state_dict) 131 | else: 132 | model = torch.load(path, map_location='cpu') 133 | return self.__set_model(model) 134 | 135 | def get_last_epoch(self): 136 | """ 137 | Get last saved model epoch. 138 | 139 | Returns: 140 | int: number of last epoch 141 | """ 142 | name_pre = '_'.join((self.prefix,) + ('',)) 143 | last_epoch = -2 144 | 145 | for fname in os.listdir(self.logger.dir_save): 146 | fname = str(fname) 147 | if fname.startswith(name_pre) and fname.endswith('.pth'): 148 | suffix = fname.replace(name_pre, '').replace('.pth', '') 149 | if suffix == 'init': 150 | this_epoch = -1 151 | elif suffix.isdigit(): 152 | # correct the `epoch + 1` in `save_epoch()` 153 | this_epoch = int(suffix) - 1 154 | else: 155 | this_epoch = -2 156 | if this_epoch > last_epoch: 157 | last_epoch = this_epoch 158 | return last_epoch 159 | 160 | def save(self, *suffix): 161 | """ 162 | Save model with given name string. 163 | """ 164 | name = '_'.join((self.prefix,) + suffix) 165 | path = self.logger.path_join(name + '.pth') 166 | 167 | if self.state_only: 168 | torch.save(self.state_dict, path) 169 | else: 170 | torch.save(self.model, path) 171 | 172 | def save_epoch(self, epoch, period=1): 173 | """ 174 | Save model each epoch period. 175 | 176 | Args: 177 | epoch (int): Current epoch. Start from 0 (display as epoch + 1). 178 | period (int, optional): Save period. Defaults to 1 (save every epochs). 
179 | """ 180 | if (epoch + 1) % period == 0: 181 | self.save(str(epoch+1)) 182 | 183 | def save_best(self, acc_curr, epoch=-1, print_log=True): 184 | """ 185 | Save model with best accuracy. 186 | 187 | Args: 188 | acc_curr (int/float): Current accuracy. 189 | """ 190 | is_best = False 191 | if not hasattr(self, 'acc_best'): 192 | self.acc_best = acc_curr 193 | self.epoch_best = epoch 194 | is_best = True 195 | if acc_curr > self.acc_best: 196 | self.acc_best = acc_curr 197 | self.epoch_best = epoch 198 | self.save('best') 199 | is_best = True 200 | 201 | if print_log: 202 | self.logger.print('[best saved] accuracy: {:>.4f}'.format(self.acc_best)) 203 | 204 | return is_best 205 | -------------------------------------------------------------------------------- /Train/run_node.py: -------------------------------------------------------------------------------- 1 | # Ref: https://github.com/chennnM/GBP 2 | import time 3 | import random 4 | import argparse 5 | import resource 6 | import numpy as np 7 | from sklearn.metrics import f1_score 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.utils.data as Data 13 | 14 | from logger import Logger, ModelLogger, prepare_opt 15 | from loader import load_node_data 16 | from model import Dense, DenseSkip 17 | 18 | 19 | # Training settings 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('-f', '--seed', type=int, default=0, help='random seed.') 22 | parser.add_argument('-c', '--config', default='./config/reddit.json', help='config path.') 23 | parser.add_argument('-v', '--dev', type=int, default=-1, help='device id.') 24 | args = prepare_opt(parser) 25 | 26 | num_thread = 32 27 | random.seed(args.seed) 28 | np.random.seed(args.seed) 29 | torch.manual_seed(args.seed) 30 | if args.dev >= 0: 31 | torch.cuda.manual_seed(args.seed) 32 | 33 | print('-' * 20) 34 | flag_run = str(args.seed) 35 | logger = Logger(args.data, args.algo, flag_run=flag_run) 36 | 
logger.save_opt(args) 37 | model_logger = ModelLogger(logger, state_only=True) 38 | 39 | feat, labels, idx = load_node_data(args.algo, datastr=args.data, datapath=args.path, 40 | inductive=args.inductive, multil=args.multil, spt=args.spt, 41 | alpha=args.alpha, eps=args.eps, rrz=args.rrz, seed=args.seed) 42 | nclass = labels.shape[1] if args.multil else int(labels.max()) + 1 43 | 44 | model = DenseSkip(nfeat=feat['train'].shape[1], nlayers=args.layer, 45 | nhidden=args.hidden, nclass=nclass, 46 | dropout=args.dropout, bias=args.bias) 47 | print(model) 48 | model_logger.regi_model(model, save_init=False) 49 | if args.dev >= 0: 50 | model = model.cuda(args.dev) 51 | 52 | optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) 53 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, threshold=1e-4, patience=20, verbose=False) 54 | loss_fn = nn.BCEWithLogitsLoss() if args.multil else nn.CrossEntropyLoss() 55 | 56 | ds_train = Data.TensorDataset(feat['train'], labels[idx['train']]) 57 | loader_train = Data.DataLoader(dataset=ds_train, batch_size=args.batch, 58 | shuffle=True, num_workers=num_thread) 59 | ds_val = Data.TensorDataset(feat['val'], labels[idx['val']]) 60 | loader_val = Data.DataLoader(dataset=ds_val, batch_size=args.batch, 61 | shuffle=False, num_workers=num_thread) 62 | ds_test = Data.TensorDataset(feat['test'], labels[idx['test']]) 63 | loader_test = Data.DataLoader(dataset=ds_test, batch_size=args.batch, 64 | shuffle=False, num_workers=num_thread) 65 | 66 | 67 | def train(ld=loader_train): 68 | model.train() 69 | loss_list = [] 70 | time_epoch = 0 71 | for _, (batch_x, batch_y) in enumerate(ld): 72 | if args.dev >= 0: 73 | batch_x = batch_x.cuda(args.dev) 74 | batch_y = batch_y.cuda(args.dev) 75 | time_start = time.time() 76 | optimizer.zero_grad() 77 | output = model(batch_x) 78 | loss_batch = loss_fn(output, batch_y) 79 | loss_batch.backward() 80 | optimizer.step() 81 | time_epoch += 
(time.time()-time_start) 82 | loss_list.append(loss_batch.item()) 83 | return np.mean(loss_list), time_epoch 84 | 85 | 86 | def eval(ld): 87 | model.eval() 88 | micro, num_total = 0, 0 89 | with torch.no_grad(): 90 | for step, (batch_x, batch_y) in enumerate(ld): 91 | output_list, labels_list = None, None 92 | # if ((step + 1) % (len(ld) // 20) == 0): 93 | # print(f'{step + 1} {(step + 1) // (len(ld) // 10):g}: f1 {micro_test / num_test}') 94 | if args.dev >= 0: 95 | batch_x = batch_x.cuda(args.dev) 96 | output = model(batch_x) 97 | if not args.multil: 98 | output = output.max(1)[1] 99 | output_list = output.cpu().detach().numpy() 100 | labels_list = batch_y.detach().numpy() 101 | 102 | if args.multil: 103 | output_list[output_list > 0] = 1 104 | output_list[output_list <= 0] = 0 105 | micro_batch = f1_score(labels_list, output_list, average='micro') 106 | else: 107 | micro_batch = f1_score(labels_list, output_list, average='micro') 108 | micro += micro_batch * len(batch_y) 109 | num_total += len(batch_y) 110 | return micro / num_total 111 | 112 | 113 | def eval_v2(ld): 114 | """Deprecated for demanding larger memory.""" 115 | model.eval() 116 | micro, num_total = 0, 0 117 | with torch.no_grad(): 118 | for step, (batch_x, batch_y) in enumerate(ld): 119 | if args.dev >= 0: 120 | batch_x = batch_x.cuda(args.dev) 121 | output = model(batch_x) 122 | if not args.multil: 123 | output = output.max(1)[1] 124 | output_list = output.cpu().detach().numpy() 125 | labels_list = batch_y.detach().numpy() 126 | 127 | if args.multil: 128 | output_bool = np.zeros_like(output_list, dtype=bool) 129 | output_bool[output_list > 0] = True 130 | micro_batch = f1_score(labels_list, output_bool, average='micro') 131 | else: 132 | micro_batch = f1_score(labels_list, output_list, average='micro') 133 | micro += micro_batch * len(batch_y) 134 | num_total += len(batch_y) 135 | return micro / num_total 136 | 137 | 138 | print('-' * 20, flush=True) 139 | # print('Start training...') 140 | 
train_time = 0 141 | conv_epoch = 0 142 | 143 | for epoch in range(args.epochs): 144 | loss_train, train_ep = train() 145 | train_time += train_ep 146 | acc_val = eval(ld=loader_val) 147 | scheduler.step(acc_val) 148 | if (epoch+1) % 1 == 0: 149 | res = f"Epoch:{epoch:04d} | train loss:{loss_train:.4f}, val acc:{acc_val:.4f}, cost:{train_time:.4f}" 150 | logger.print(res) 151 | is_best = model_logger.save_best(acc_val, epoch=epoch, print_log=False) 152 | # Early stop if converge 153 | conv_epoch = 0 if is_best else conv_epoch + 1 154 | if conv_epoch == args.patience: 155 | break 156 | 157 | model = model_logger.load_model('best') 158 | acc_train = eval(ld=loader_train) 159 | print(f"Train time cost: {train_time:0.4f}") 160 | print(f"Train best acc: {acc_train:0.4f}, Val best acc: {model_logger.acc_best:0.4f}", flush=True) 161 | 162 | print('-' * 20) 163 | # print("Start inference...") 164 | start = time.time() 165 | acc_test = eval(ld=loader_test) 166 | time_inference = time.time() - start 167 | memory = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 168 | print(f"Test time cost: {time_inference:0.4f}, Memory: {memory / 2**20:.3f}GB") 169 | print(f'Best epoch: {model_logger.epoch_best}, Test acc: {acc_test:.4f}', flush=True) 170 | print('-' * 20) 171 | -------------------------------------------------------------------------------- /Precompute/BatchRandomWalk.h: -------------------------------------------------------------------------------- 1 | /* 2 | Cached Random Walk in Batch 3 | Author: nyLiao 4 | */ 5 | #ifndef SCARA_BATCHRANDOMWALK_H 6 | #define SCARA_BATCHRANDOMWALK_H 7 | 8 | #ifdef ENABLE_RW 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "BasicDefinition.h" 15 | #include "HelperFunctions.h" 16 | #include "Graph.h" 17 | #include "fastPRNG.h" 18 | 19 | 20 | class XoshiroGenerator { 21 | private: 22 | fastPRNG::fastXS64 generator; 23 | public: 24 | void initialize(uint64_t seed) { 25 | generator.seed(seed); 26 | } 27 | 28 | 
inline double uniform_real() { 29 | return generator.xorShift_UNI(); 30 | } 31 | 32 | inline uint32_t uniform_int(const uint32_t &_end) { 33 | return generator.xorShift() % _end; 34 | } 35 | 36 | inline bool bias_coin_is_head(const double &_prob) { 37 | return generator.xorShift_UNI() <= _prob; 38 | } 39 | 40 | inline bool bias_coin_is_tail(const double &_prob) { 41 | return generator.xorShift_UNI() > _prob; 42 | } 43 | }; 44 | 45 | extern XoshiroGenerator fRNG; 46 | 47 | extern XoshiroGenerator init_rng(uint64_t seed); 48 | 49 | 50 | template 51 | class Alias { 52 | private: 53 | const unsigned int size; 54 | const IntVector &first; 55 | IntVector second; 56 | std::vector probability; 57 | public: 58 | Alias(const IntVector &_active_ids, const std::vector &_active_residuals) : 59 | size(_active_ids.size()), 60 | first(_active_ids), 61 | second(_active_ids.size(), 0), 62 | probability(_active_residuals) { 63 | const FLT sum = std::accumulate(_active_residuals.begin(), _active_residuals.end(), 0.0); 64 | std::stack> small; 65 | std::stack> big; 66 | const FLT size_over_sum = size / sum; 67 | for (NInt id = 0; id < size; ++id) { 68 | probability[id] *= size_over_sum; 69 | if (probability[id] > 1) { 70 | big.push(id); 71 | } else { 72 | small.push(id); 73 | } 74 | } 75 | while (!small.empty() && !big.empty()) { 76 | const NInt small_id = small.top(); 77 | small.pop(); 78 | const NInt big_id = big.top(); 79 | second[small_id] = first[big_id]; 80 | probability[big_id] -= (1 - probability[small_id]); 81 | if (probability[big_id] < 1) { 82 | small.push(big_id); 83 | big.pop(); 84 | } 85 | } 86 | } 87 | 88 | inline NInt generate_random_id() const { 89 | const unsigned int bucket_id = fRNG.uniform_int(size); 90 | return fRNG.bias_coin_is_head(probability[bucket_id]) ? 
first[bucket_id] : second[bucket_id]; 91 | } 92 | 93 | }; 94 | 95 | 96 | class WalkCache { 97 | private: 98 | const Graph &graph; 99 | IntVector walks; 100 | IntVector start_indices; 101 | 102 | public: 103 | 104 | explicit WalkCache(const Graph &_graph) : 105 | graph(_graph), 106 | walks(_graph.getNumOfEdges() + _graph.get_num_dead_end(), 0), 107 | start_indices(_graph.getNumOfVertices(), 0) { 108 | } 109 | 110 | void generate() { 111 | double time_start = getCurrentTime(); 112 | const NInt num_vertices = graph.getNumOfVertices(); 113 | for (NInt sid = 0, index = 0; sid < num_vertices; ++sid) { 114 | // if (sid % 500000 == 0) { cout << sid << " vertices processed.\n"; } 115 | start_indices[sid] = index; 116 | const NInt &sid_idx_start = graph.get_neighbor_list_start_pos(sid); 117 | const NInt &sid_idx_end = graph.get_neighbor_list_start_pos(sid + 1); 118 | const NInt sid_degree = sid_idx_end - sid_idx_start; 119 | for (uint32_t j = 0; j < sid_degree; ++j) { 120 | const NInt sid_shift = fRNG.uniform_int(sid_degree); 121 | NInt current_id = graph.getOutNeighbor(sid_idx_start + sid_shift); 122 | while (fRNG.bias_coin_is_tail(graph.get_alpha())) { 123 | // TODO: stop at L-hop 124 | const NInt &idx_start = graph.get_neighbor_list_start_pos(current_id); 125 | const NInt &idx_end = graph.get_neighbor_list_start_pos(current_id + 1); 126 | const NInt degree = idx_end - idx_start; 127 | const NInt shift = fRNG.uniform_int(degree); 128 | const NInt nid = graph.getOutNeighbor(idx_start + shift); 129 | current_id = nid; 130 | } 131 | walks[index++] = current_id; 132 | } 133 | } 134 | printf("Walk Cache Time: %.6f\n", getCurrentTime() - time_start); 135 | } 136 | 137 | void save(const std::string &_filename) const { 138 | const auto start = getCurrentTime(); 139 | if (std::FILE *f = std::fopen(_filename.c_str(), "wb")) { 140 | std::fwrite(walks.data(), sizeof walks[0], walks.size(), f); 141 | std::fclose(f); 142 | } else { 143 | printf("WalkCache::save; File Not Exists.\n"); 
144 | } 145 | const auto end = getCurrentTime(); 146 | // printf("Time Used For Saving : %.2f\n", end - start); 147 | } 148 | 149 | void load(const std::string &_filename) { 150 | const auto start = getCurrentTime(); 151 | walks.clear(); 152 | walks.resize(graph.getNumOfEdges() + graph.get_num_dead_end(), 0); 153 | assert(walks.size() == graph.get_neighbor_list_start_pos(graph.getNumOfVertices())); 154 | if (std::FILE *f = std::fopen(_filename.c_str(), "rb")) { 155 | MSG(walks.size()) 156 | size_t rtn = std::fread(walks.data(), sizeof walks[0], walks.size(), f); 157 | // printf("Returned Value of fread: %zu\n", rtn); 158 | std::fclose(f); 159 | start_indices.clear(); 160 | start_indices.resize(graph.getNumOfVertices(), 0); 161 | for (NInt prev_id = 0, id = 1; id < graph.getNumOfVertices(); ++prev_id, ++id) { 162 | start_indices[id] = 163 | start_indices[prev_id] 164 | + std::max((NInt) 1u, graph.original_out_degree(prev_id)); 165 | } 166 | } else { 167 | printf("WalkCache::load; File Not Exists.\n"); 168 | exit(1); 169 | } 170 | const auto end = getCurrentTime(); 171 | // printf("Time Used For Loading Cache : %.2f\n", end - start); 172 | } 173 | 174 | inline const NInt &get_zero_hop_start_index(const NInt &_vid) const { 175 | // assert(_vid < graph.getNumOfVertices()); 176 | return start_indices[_vid]; 177 | } 178 | 179 | inline NInt get_one_hop_start_index(const NInt &_vid) const { 180 | // assert(_vid < graph.getNumOfVertices()); 181 | return start_indices[_vid]; 182 | } 183 | 184 | inline const NInt &get_walk(const NInt &_index) const { 185 | assert(_index < walks.size()); 186 | return walks[_index]; 187 | } 188 | }; 189 | 190 | #endif 191 | #endif //SCARA_BATCHRANDOMWALK_H 192 | -------------------------------------------------------------------------------- /LICENSE.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf2706 2 | \cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 
Arial-BoldItalicMT;\f1\fswiss\fcharset0 ArialMT;\f2\fswiss\fcharset0 Arial-BoldMT; 3 | \f3\froman\fcharset0 TimesNewRomanPS-BoldMT;\f4\fswiss\fcharset0 Arial-ItalicMT;\f5\froman\fcharset0 TimesNewRomanPSMT; 4 | } 5 | {\colortbl;\red255\green255\blue255;\red0\green0\blue128;} 6 | {\*\expandedcolortbl;;\csgenericrgb\c0\c0\c50196;} 7 | {\*\listtable{\list\listtemplateid1\listhybrid{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace360\levelindent0{\*\levelmarker \{decimal\}.}{\leveltext\leveltemplateid1\'02\'00.;}{\levelnumbers\'01;}\fi-360\li720\lin720 }{\listname ;}\listid1}} 8 | {\*\listoverridetable{\listoverride\listid1\listoverridecount0\ls1}} 9 | {\info 10 | {\title Microsoft Research License Agreement} 11 | {\author aphillip} 12 | {\*\company Microsoft Corporation}}\vieww12540\viewh16140\viewkind1 13 | \deftab720 14 | \pard\pardeftab720\ri0\sb100\sa100\partightenfactor0 15 | 16 | \f0\i\b\fs22 \cf0 NANYANG TECHNOLOGICAL UNIVERSITY - NTUITIVE PTE LTD (NTUITIVE) Dual License Agreement\ 17 | \pard\pardeftab720\ri0\qj\partightenfactor0 18 | \cf0 Non-Commercial Use Only 19 | \f1\i0\b0 This NTUITIVE License Agreement, including all exhibits ("NTUITIVE-LA") is a legal agreement between you and NTUITIVE (or \'93we\'94) located at 71 Nanyang Drive, NTU Innovation Centre, #01-109, Singapore 637722, a wholly owned subsidiary of Nanyang Technological University (\'93 20 | \f2\b NTU 21 | \f1\b0 \'94) for the software or data identified above, which may include source code, and any associated materials, text or speech files, associated media and "online" or electronic documentation 22 | \fs24 \cf2 23 | \fs22 \cf0 and any updates we provide in our discretion (together, the "Software"). \ 24 | \ 25 | By installing, copying, or otherwise using this Software, found at PVLDB/GitHub, you agree to be bound by the terms of this NTUITIVE-LA. If you do not agree, do not install copy or use the Software. 
The Software is protected by copyright and other intellectual property laws and is licensed, not sold. If you wish to obtain a commercial royalty bearing license to this software please contact us at liao0090 at e.ntu.edu.sg\ 26 | \pard\pardeftab720\ri0\qj\partightenfactor0 27 | \cf0 \strike \strikec0 \ 28 | \pard\pardeftab720\ri0\sb100\sa100\qj\partightenfactor0 29 | 30 | \f2\b \cf0 \strike0\striked0 SCOPE OF RIGHTS:\ 31 | \pard\pardeftab720\ri0\sb100\sa100\qj\partightenfactor0 32 | 33 | \f1\b0 \cf0 You may use, copy, reproduce, and distribute this Software for any non-commercial purpose, subject to the restrictions in this NTUITIVE-LA. Some purposes which can be non-commercial are teaching, academic research, public demonstrations and personal experimentation. You may also distribute this Software with books or other teaching materials, or publish the Software on websites, that are intended to teach the use of the Software for academic or other non-commercial purposes.\ 34 | You may not use or distribute this Software or any derivative works in any form for commercial purposes. Examples of commercial purposes would be running business operations, licensing, leasing, or selling the Software, distributing the Software for use with commercial products, using the Software in the creation or use of commercial products or any other activity which purpose is to procure a commercial gain to you or others.\ 35 | If the Software includes source code or data, you may create derivative works of such portions of the Software and distribute the modified Software for non-commercial purposes, as provided herein. \ 36 | If you distribute the Software or any derivative works of the Software, you will distribute them under the same terms and conditions as in this license, and you will not grant other rights to the Software or derivative works that are different from those provided by this NTUITIVE-LA. 
\ 37 | \pard\pardeftab720\ri0\sb100\sa240\qj\partightenfactor0 38 | \cf0 If you have created derivative works of the Software, and distribute such derivative works, you will cause the modified files to carry prominent notices so that recipients know that they are not receiving the original Software. Such notices must state: (i) that you have changed the Software; and (ii) the date of any changes.\ 39 | \pard\pardeftab720\ri0\sb100\sa100\qj\partightenfactor0 40 | \cf0 \ 41 | You may not distribute this Software or any derivative works. \ 42 | \pard\pardeftab720\ri0\sb100\sa100\partightenfactor0 43 | \cf0 In return, we simply require that you agree: \ 44 | \pard\tx720\pardeftab720\li720\fi-360\ri0\sb100\sa240\qj\partightenfactor0 45 | \ls1\ilvl0\cf0 1. That you will not remove any copyright or other notices from the Software.\ 46 | 2. That if any of the Software is in binary format, you will not attempt to modify such portions of the Software, or to reverse engineer or decompile them, except and only to the extent authorized by applicable law. \ 47 | 3. That NTUITIVE is granted back, without any restrictions or limitations, a non-exclusive, perpetual, irrevocable, royalty-free, assignable and sub-licensable license, to reproduce, publicly perform or display, install, use, modify, post, distribute, make and have made, sell and transfer your modifications to and/or derivative works of the Software source code or data, for any purpose. \ 48 | 4. That any feedback about the Software provided by you to us is voluntarily given, and NTUITIVE shall be free to use the feedback as it sees fit without obligation or restriction of any kind, even if the feedback is designated by you as confidential. \ 49 | \pard\pardeftab720\li720\fi-360\ri0\sb120\sa120\partightenfactor0 50 | \cf0 5. THAT THE SOFTWARE COMES "AS IS", WITH NO WARRANTIES. 
THIS MEANS NO EXPRESS, IMPLIED OR STATUTORY WARRANTY, INCLUDING WITHOUT LIMITATION, WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, any warranty against interference with your enjoyment of the Software OR ANY WARRANTY OF TITLE OR NON-INFRINGEMENT. There is no warranty that this Software will fulfill any of your particular purposes or needs. ALSO, YOU MUST PASS THIS DISCLAIMER ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.\ 51 | \pard\pardeftab720\li720\fi-360\ri0\sb100\sa240\qj\partightenfactor0 52 | \cf0 6. THAT NEITHER NTUITIVE NOR NTU NOR ANY CONTRIBUTOR TO THE SOFTWARE WILL BE LIABLE FOR ANY DAMAGES RELATED TO THE SOFTWARE OR THIS NTUITIVE-LA, INCLUDING DIRECT, INDIRECT, SPECIAL, CONSEQUENTIAL OR INCIDENTAL DAMAGES, TO THE MAXIMUM EXTENT THE LAW PERMITS, NO MATTER WHAT LEGAL THEORY IT IS BASED ON. ALSO, YOU MUST PASS THIS LIMITATION OF LIABILITY ON WHENEVER YOU DISTRIBUTE THE SOFTWARE OR DERIVATIVE WORKS.\ 53 | 7. That we have no duty of reasonable care or lack of negligence, and we are not obligated to (and will not) provide technical support for the Software.\ 54 | 8. That if you breach this NTUITIVE-LA or if you sue anyone over patents that you think may apply to or read on the Software or anyone's use of the Software, this NTUITIVE-LA (and your license and rights obtained herein) terminate automatically. Upon any such termination, you shall destroy all of your copies of the Software immediately. Sections 3, 4, 5, 6, 7, 8, 11 and 12 of this NTUITIVE-LA shall survive any termination of this NTUITIVE-LA.\ 55 | 9. That the patent rights, if any, granted to you in this NTUITIVE-LA only apply to the Software, not to any derivative works you make.\ 56 | 10. That the Software may be subject to U.S. 
export jurisdiction at the time it is licensed to you, and it may be subject to additional export or import laws in other places.\'a0 You agree to comply with all such laws and regulations that may apply to the Software after delivery of the software to you.\ 57 | 11. That all rights not expressly granted to you in this NTUITIVE-LA are reserved.\ 58 | 12. That this NTUITIVE-LA shall be construed and controlled by the laws of the Republic of Singapore without regard to conflicts of law. If any provision of this NTUITIVE-LA shall be deemed unenforceable or contrary to law, the rest of this NTUITIVE-LA shall remain in full effect and interpreted in an enforceable manner that most nearly captures the intent of the original language. \ 59 | \pard\pardeftab720\ri0\qj\partightenfactor0 60 | 61 | \f3\b\fs24 \cf0 \page \ 62 | \pard\pardeftab720\ri0\qj\partightenfactor0 63 | 64 | \f4\i\b0\fs22 \cf0 Do you accept all of the terms of the preceding NTUITIVE-LA license agreement? If you accept the terms, click \'93I Agree,\'94 then \'93Next.\'94 Otherwise click \'93Cancel.\'94\ 65 | \ 66 | Copyright (c) NTUITIVE. 
All rights reserved.\ 67 | \pard\pardeftab720\ri0\qj\partightenfactor0 68 | 69 | \f3\i0\b\fs24 \cf0 \ 70 | \ 71 | \pard\pardeftab720\ri0\partightenfactor0 72 | 73 | \f5\b0 \cf0 \ 74 | } -------------------------------------------------------------------------------- /Precompute/FeatureOp.h: -------------------------------------------------------------------------------- 1 | /* 2 | Feature calculation 3 | Author: nyLiao 4 | */ 5 | #ifndef SCARA_FEATUREOP_H 6 | #define SCARA_FEATUREOP_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "BasicDefinition.h" 13 | #include "MyType.h" 14 | #include "FeatureDecomp.h" 15 | 16 | 17 | // ==================== Basic 18 | template 19 | inline FLT vector_L1(const std::vector Vec){ 20 | FLT sum = 0; 21 | for(FLT a : Vec) 22 | sum += fabsf(a); 23 | return sum; 24 | } 25 | 26 | template 27 | inline FLT vector_L2(const std::vector Vec){ 28 | FLT sum = 0; 29 | for(FLT a : Vec) 30 | sum += a * a; 31 | return sqrtf(sum); 32 | } 33 | 34 | template 35 | inline IntVector arg_kmin(const std::vector &Vec, const NInt k) { 36 | /* Get the last-k indices of Vec */ 37 | std::priority_queue, 38 | std::vector< std::pair >, 39 | std::less >> q; 40 | for (NInt i = 0; i < Vec.size(); i++) { 41 | if (q.size() < k) 42 | q.push({Vec[i], i}); 43 | else if (Vec[i] < q.top().first) { 44 | q.pop(); 45 | q.push({Vec[i], i}); 46 | } 47 | } 48 | IntVector res(k); 49 | for (NInt i = 0; i < k; i++) { 50 | res[i] = q.top().second; 51 | q.pop(); 52 | } 53 | return res; 54 | } 55 | 56 | template 57 | inline IntVector arg_decsort_abs(const std::vector &Vec) { 58 | /* Get the indices of Vec in descending order */ 59 | IntVector res(Vec.size()); 60 | std::iota(res.begin(), res.end(), 0); 61 | std::sort(res.begin(), res.end(), 62 | [&Vec](NInt i1, NInt i2) { return fabs(Vec[i1]) > fabs(Vec[i2]); }); 63 | return res; 64 | } 65 | 66 | // ==================== Vector measurement 67 | template 68 | inline FLT calc_L1_residue(std::vector &V1, const 
std::vector &V2, const float pace = 1.0) { 69 | NInt index; 70 | FLT used_sum = 0; 71 | FLT theta; 72 | // std::vector> theta_counter; 73 | std::vector> theta_counter; 74 | for (NInt i = 0; i < V1.size(); i++) { 75 | if (V2[i] != 0) { 76 | theta = V1[i] / V2[i]; 77 | theta_counter.push_back({i, theta}); 78 | } 79 | } 80 | std::sort(theta_counter.begin(), theta_counter.end(), 81 | [](const IdScorePair &a, const IdScorePair &b) { 82 | return fabsf(a.score) < fabsf(b.score); 83 | }); 84 | for (NInt i = 0; i < theta_counter.size(); i += 1) { 85 | index = theta_counter[i].id; 86 | theta = theta_counter[i].score; 87 | used_sum += fabsf(V2[index]); 88 | if (used_sum > 0.5 || index == -1) 89 | break; 90 | } 91 | 92 | FLT orig_sum = 0; 93 | FLT diff_sum = 0; 94 | for (NInt i = 0; i < V1.size(); i++) { 95 | orig_sum += fabsf(V1[i]); 96 | diff_sum += fabsf(V1[i] - theta * V2[i] * pace); 97 | } 98 | if (diff_sum > orig_sum) 99 | return 0; 100 | 101 | for (NInt i = 0; i < V1.size(); i++) { 102 | V1[i] = V1[i] - theta * V2[i] * pace; 103 | } 104 | // printf("theta: %.6f, residue: %.6f\n", theta, diff_sum); 105 | return theta * pace; 106 | } 107 | 108 | template 109 | inline FLT calc_L2_residue(std::vector &V1, const std::vector &V2, const float pace = 1.0) { 110 | FLT prd = 0; 111 | FLT sum2 = 0; 112 | for (NInt i = 0; i < V1.size(); i++) { 113 | prd += V1[i] * V2[i]; 114 | sum2 += V2[i] * V2[i]; 115 | } 116 | FLT theta = prd / sum2; 117 | if (fabs(theta * pace) < 1e-4) 118 | return 0; 119 | 120 | FLT orig_sum = 0; 121 | FLT diff_sum = 0; 122 | for (NInt i = 0; i < V1.size(); i++) { 123 | orig_sum += fabsf(V1[i]); 124 | diff_sum += fabsf(V1[i] - theta * V2[i] * pace); 125 | } 126 | if (diff_sum > orig_sum) 127 | return 0; 128 | 129 | for (NInt i = 0; i < V1.size(); i++) { 130 | V1[i] = V1[i] - theta * V2[i] * pace; 131 | } 132 | // printf("theta: %.6f, residue: %.6f\n", theta, diff_sum); 133 | return theta * pace; 134 | } 135 | 136 | template 137 | inline FLT 
calc_L1_distance(const std::vector &V1, const std::vector &V2) { 138 | /* Ranges: 139 | V~[0, 1] -> distance~[0, 1] 140 | V~[-1, 1] -> distance~[0, 2] 141 | */ 142 | FLT distance = 0; 143 | for (NInt i = 0; i < V1.size(); i++) { 144 | distance += fabsf(V1[i] - V2[i]); 145 | } 146 | return distance; 147 | } 148 | 149 | template 150 | inline FLT calc_L2_distance(const std::vector &V1, const std::vector &V2) { 151 | /* Ranges: (cosine angle = prd / (sqrt(sum1) * sqrt(sum2))) 152 | V~[0, 1] -> distance~[0, 1] 153 | V~[-1, 1] -> distance~[0, 2] 154 | */ 155 | // TODO: cache for feature-feature product and difference 156 | FLT distance = 0; 157 | FLT prd = 0; 158 | FLT sum1 = 0; 159 | FLT sum2 = 0; 160 | for (NInt i = 0; i < V1.size(); i++) { 161 | prd += V1[i] * V2[i]; 162 | sum1 += V1[i] * V1[i]; 163 | sum2 += V2[i] * V2[i]; 164 | } 165 | // distance = 1 - prd / (sqrt(sum1) * sqrt(sum2)); 166 | distance = 1 - fabsf(prd / (sqrt(sum1) * sqrt(sum2))); 167 | return distance; 168 | } 169 | 170 | // ==================== Reuse functions 171 | inline IntVector select_base(MyMatrix &feat_matrix, const NInt base_size) { 172 | NInt feat_size = feat_matrix.nrows(); 173 | IntVector base_idx(base_size, 0); // index of base features 174 | std::vector> min_counter(feat_size, {0, 0.0}); // (min base id, min norm) for each feature 175 | // Find minimum distance feature for each feature 176 | for (NInt i = 0; i < feat_size; i++) { 177 | ScoreFlt dis_min = 4.0 * feat_size; 178 | NInt idx_min = -1; 179 | for (NInt j = 0; j < feat_size; j++) { 180 | if (i != j) { 181 | ScoreFlt dis = calc_L2_distance(feat_matrix[i], feat_matrix[j]); 182 | if (dis_min > dis) { 183 | dis_min = dis; 184 | idx_min = j; 185 | } 186 | } 187 | } 188 | // printf("id: %4d, dis: %.8f, tar: %4d\n", i, dis_min, idx_min); 189 | if (idx_min < 0 || idx_min > feat_size) continue; 190 | min_counter[idx_min].id = idx_min; 191 | // Add weight for counter, distance closer to 1 is smaller weight 192 | min_counter[idx_min].score 
+= fabsf(1 - dis_min); 193 | } 194 | 195 | // Decide base features with most closest features 196 | std::sort(min_counter.begin(), min_counter.end(), IdScorePairComparatorGreater()); 197 | for (NInt i = 0; i < base_size; i++) { 198 | // printf("Base %4d: dis: %.8f, tar: %4d\n", i, min_counter[i].score, min_counter[i].id); 199 | base_idx[i] = min_counter[i].id; 200 | } 201 | return base_idx; 202 | } 203 | 204 | inline FltVector reuse_weight(FltVector &feat_vector, const MyMatrix &base_matrix) { 205 | FltVector base_weight(base_matrix.nrows(), 0.0); 206 | for (ScoreFlt delta = 1; delta <= 16; delta *= 2) { 207 | ScoreFlt dis_min = base_matrix.nrows(); 208 | NInt idx_min = 0; 209 | for (NInt j = 0; j < base_matrix.nrows(); j++) { 210 | ScoreFlt dis = calc_L2_distance(feat_vector, base_matrix[j]); 211 | if (dis_min > dis) { 212 | dis_min = dis; 213 | idx_min = j; 214 | } 215 | } 216 | ScoreFlt theta = calc_L2_residue(feat_vector, base_matrix[idx_min], 1.0); 217 | if (fabs(theta) / delta < 1 / 16) break; 218 | base_weight[idx_min] += theta; 219 | } 220 | return base_weight; 221 | } 222 | 223 | // ==================== Decomposition functions 224 | inline IntVector sample_nodes(const IntVector Vt_nodes, NInt Vs_num) { 225 | IntVector Vs_nodes(Vt_nodes); 226 | NInt Vs_num_real(std::min(Vs_num, (NInt)Vt_nodes.size())); 227 | std::shuffle(Vs_nodes.begin(), Vs_nodes.end(), std::mt19937(param.seed)); 228 | Vs_nodes.resize(Vs_num_real); 229 | return Vs_nodes; 230 | } 231 | 232 | inline IntVector select_pc(ScoreMatrix &feat_Matrix, MyMatrix &theta_matrix, 233 | const NInt base_size, const ScoreFlt mul) { 234 | NInt feat_size = feat_Matrix.rows(); 235 | NInt V_num = feat_Matrix.cols(); 236 | assert(feat_size < V_num && "ERROR: Feature size should be smaller than sampled vertex number"); 237 | feat_Matrix.transposeInPlace(); // feat_Matrix: Vs_num * feat_size 238 | 239 | // Select base features (columns) by minimum residue 240 | RobustPca Rpca(feat_Matrix, mul); 241 | 
Rpca.fit(base_size); 242 | ScoreVector feat_Res_ = Rpca.SparseComponent().colwise().norm(); 243 | FltVector feat_res(feat_Res_.data(), feat_Res_.data() + feat_Res_.size()); 244 | IntVector base_idx = arg_kmin(feat_res, base_size); 245 | ScoreMatrix base_Matrix = feat_Matrix(Eigen::all, base_idx); // base_Matrix: Vs_num * base_size 246 | 247 | // Fit theta matrix 248 | ScoreMatrix theta_Matrix_ = Rpca.fit_fixed(base_Matrix); 249 | theta_matrix.from_Eigen(theta_Matrix_.transpose()); // theta_Matrix_: feat_size * base_size 250 | ScoreMatrix lrcn = base_Matrix * theta_Matrix_; 251 | ScoreMatrix diff = feat_Matrix - lrcn; 252 | cout<< " lrcn Fro norm: "<() << endl; 253 | cout<< " diff Fro norm: "<() << endl; 254 | // std::ofstream file1("output_theta.txt"); 255 | // file1 << theta_Matrix_.transpose(); 256 | return base_idx; 257 | } 258 | 259 | #endif // SCARA_FEATUREOP_H 260 | -------------------------------------------------------------------------------- /Train/loader.py: -------------------------------------------------------------------------------- 1 | import gc 2 | import numpy as np 3 | from sklearn.preprocessing import StandardScaler 4 | import torch 5 | 6 | from data_processor import DataProcess, diag_sp 7 | 8 | 9 | np.set_printoptions(linewidth=160, edgeitems=5, threshold=20, 10 | formatter=dict(float=lambda x: "% 9.3e" % x)) 11 | torch.set_printoptions(linewidth=160, edgeitems=5) 12 | 13 | 14 | def lmatstd(m): 15 | """Large matrix standardization""" 16 | rowh = m.shape[0] // 2 17 | std = np.std(m[:rowh], axis=0) 18 | m[:rowh] /= std 19 | m[rowh:] /= std 20 | gc.collect() 21 | return m 22 | 23 | 24 | def matstd_clip(m, idx, with_mean=False): 25 | """Standardize and clip per feature""" 26 | # idx = np.setdiff1d(idx, [0]) 27 | # if (len(idx) > 0.75 * m.shape[0]) and (m.shape[0] > 2,000,000): 28 | # idx = np.random.choice(idx, size=int(len(idx)/5), replace=False) 29 | scaler = StandardScaler(with_mean=with_mean) 30 | scaler.fit(m[idx]) 31 | mean, std = 
scaler.mean_, scaler.scale_ 32 | k = 3 33 | m = np.clip(m, a_min=mean-k*std, a_max=mean+k*std) 34 | m = scaler.transform(m) 35 | return m 36 | 37 | 38 | def diag_mul(diag, m): 39 | """Diagonal matrix multiplication""" 40 | row = m.shape[0] 41 | for i in range(row): 42 | m[i] *= diag[i] 43 | return m 44 | 45 | 46 | # ==================== 47 | def load_node_data(algo: str, datastr: str, datapath: str, 48 | inductive: bool, multil: bool, spt: int, 49 | alpha: float, eps: float, rrz: float, seed: int=0): 50 | if datastr == 'paper': 51 | return load_paper_data(algo, datastr, datapath, inductive, multil, spt, alpha, eps, rrz, seed) 52 | 53 | print('-' * 20) 54 | # print("Start loading...") 55 | # Get degree and label 56 | processor = DataProcess(datastr, path=datapath, rrz=rrz, seed=seed) 57 | processor.input(['deg', 'labels']) 58 | deg = processor.deg 59 | if multil: 60 | processor.calculate(['labels_oh']) 61 | labels = torch.LongTensor(processor.labels_oh) 62 | labels = labels.float() 63 | else: 64 | labels = torch.LongTensor(processor.labels) 65 | # Get index 66 | if inductive: 67 | processor.input(['idx_train', 'idx_val', 'idx_test']) 68 | else: 69 | processor.calculate(['idx_train']) 70 | idx = {'train': torch.LongTensor(processor.idx_train), 71 | 'val': torch.LongTensor(processor.idx_val), 72 | 'test': torch.LongTensor(processor.idx_test)} 73 | # Get graph property 74 | n, m = processor.n, processor.m 75 | 76 | # Precompute integration 77 | def precompute(algo_i, idx_fit): 78 | # Load embedding 79 | est_dir = f'../save/{datastr}/{algo_i}/{seed}' 80 | if spt == 1: 81 | est_file = f'{est_dir}/score_{alpha:g}_{eps:g}.npy' 82 | features = np.load(est_file) 83 | else: 84 | features = None 85 | for i in range(spt): 86 | est_file = f'{est_dir}/score_{alpha:g}_{eps:g}_{i}.npy' 87 | features_spt = np.load(est_file) 88 | if features is None: 89 | features = features_spt.astype(np.float32) 90 | else: 91 | np.concatenate((features, features_spt), axis=0, out=features, 
dtype=np.float32) 92 | print(f' Split {i} loaded, now shape: {features.shape}') 93 | features = features.transpose() # shape [n, F] 94 | 95 | # Process degree 96 | if algo_i.endswith('_train'): 97 | processor_i = DataProcess(datastr+'_train', path=datapath, rrz=rrz, seed=seed) 98 | processor_i.input(['deg']) 99 | deg_i = processor_i.deg 100 | else: 101 | deg_i = deg 102 | deg_pow = np.power(np.maximum(deg_i, 1e-12), rrz - 1) 103 | idx_zero = np.where(deg_i == 0)[0] 104 | if len(idx_zero) > 0: 105 | print(f"Warning: {len(idx_zero)} isolated nodes found: {idx_zero}!") 106 | deg_pow[idx_zero] = 0 107 | 108 | # Normalize embedding by degree 109 | if spt == 1: 110 | deg_pow = diag_sp(deg_pow) 111 | features = deg_pow @ features # shape [n, F] 112 | else: 113 | features = diag_mul(deg_pow, lmatstd(features)) 114 | print(f'{algo_i} all head no std') 115 | print(features[:5, :]) 116 | print(f'{algo_i} train head no std ', idx_fit[:5]) 117 | print(features[idx_fit[:5], :]) 118 | features = matstd_clip(features, idx_fit, with_mean=True) 119 | # features = matstd_clip(features, idx_fit, with_mean=False) 120 | print(f'{algo_i} all head ') 121 | print(features[:5, :], flush=True) 122 | return features 123 | 124 | # Assign features 125 | features = precompute(algo, idx['train']) 126 | feat = {'val': torch.FloatTensor(features[idx['val']]), 127 | 'test': torch.FloatTensor(features[idx['test']])} 128 | if inductive: 129 | features_train = precompute(f'{algo}_train', np.arange(len(idx['train']))) 130 | feat['train'] = torch.FloatTensor(features_train) 131 | del features, features_train 132 | else: 133 | feat['train'] = torch.FloatTensor(features[idx['train']]) 134 | del features 135 | gc.collect() 136 | 137 | print('train head ', idx['train'][:5]) 138 | print(feat['train'][:5, :].numpy()) 139 | # print('test head ', idx['test'][:5]) 140 | # print(feat['test'][:5, :]) 141 | # print(labels.size(), labels) 142 | print(f"n={n}, m={m}, F_t={feat['train'].size()}") 143 | 
print(f"n_train={idx['train'].size()}, n_val={idx['val'].size()}, n_test={idx['test'].size()}", flush=True) 144 | return feat, labels, idx 145 | 146 | 147 | def load_paper_data(algo: str, datastr: str, datapath: str, 148 | inductive: bool, multil: bool, spt: int, 149 | alpha: float, eps: float, rrz: float, seed: int=0): 150 | print('-' * 20) 151 | print("Start loading paper...") 152 | # Get degree and label 153 | processor = DataProcess(datastr, path=datapath, rrz=rrz, seed=seed) 154 | processor.input(['deg', 'labels']) 155 | deg = processor.deg 156 | labels = torch.LongTensor(processor.labels) 157 | # Get index 158 | processor.input(['idx_train', 'idx_val', 'idx_test']) 159 | ridx = {'train': torch.LongTensor(processor.idx_train), 160 | 'val': torch.LongTensor(processor.idx_val), 161 | 'test': torch.LongTensor(processor.idx_test)} 162 | ridx_all = np.concatenate((processor.idx_train, processor.idx_val, processor.idx_test), dtype=np.int64) 163 | deg = deg[ridx_all] 164 | labels = labels[ridx_all] 165 | idx = {'train': torch.arange(processor.n_train, dtype=torch.long), 166 | 'val': torch.arange(processor.n_val, dtype=torch.long)+processor.n_train, 167 | 'test': torch.arange(processor.n_test, dtype=torch.long)+processor.n_train+processor.n_val} 168 | # Get graph property 169 | n, m = processor.n, processor.m 170 | 171 | # Precompute integration 172 | def precompute(algo_i, idx_fit): 173 | # Load embedding 174 | est_dir = f'../save/{datastr}/{algo_i}/{seed}' 175 | if spt == 1: 176 | est_file = f'{est_dir}/score_{alpha:g}_{eps:g}.npy' 177 | features = np.load(est_file) 178 | else: 179 | features = None 180 | for i in range(spt): 181 | est_file = f'{est_dir}/score_{alpha:g}_{eps:g}_{i}.npy' 182 | features_spt = np.load(est_file) 183 | if features is None: 184 | features = features_spt.astype(np.float32) 185 | else: 186 | np.concatenate((features, features_spt), axis=0, out=features, dtype=np.float32) 187 | print(f' Split {i} loaded, now shape: {features.shape}') 188 | 
features = features[:, ridx_all] 189 | features = features.transpose() # shape [n, F] 190 | gc.collect() 191 | 192 | # Process degree 193 | deg_i = deg 194 | deg_pow = np.power(np.maximum(deg_i, 1e-12), rrz - 1) 195 | idx_zero = np.where(deg_i == 0)[0] 196 | if len(idx_zero) > 0: 197 | print(f"Warning: {len(idx_zero)} isolated nodes found: {idx_zero}!") 198 | deg_pow[idx_zero] = 0 199 | 200 | # Normalize embedding by degree 201 | deg_pow = diag_sp(deg_pow) 202 | features = deg_pow @ lmatstd(features) # shape [n, F] 203 | # features = diag_mul(deg_pow, lmatstd(features)) 204 | print(f'{algo_i} all head no std') 205 | print(features[:5, :]) 206 | print(f'{algo_i} train head no std ', idx_fit[:5]) 207 | print(features[idx_fit[:5], :]) 208 | # features = matstd_clip(features, idx_fit, with_mean=True) 209 | features = matstd_clip(features, idx_fit, with_mean=False) 210 | # features = lmatstd(features) 211 | print(f'{algo_i} all head ') 212 | print(features[:5, :], flush=True) 213 | return features 214 | 215 | # Assign features 216 | features = precompute(algo, idx['train']) 217 | feat = {'val': torch.FloatTensor(features[idx['val']]), 218 | 'test': torch.FloatTensor(features[idx['test']])} 219 | if inductive: 220 | features_train = precompute(f'{algo}_train', np.arange(len(idx['train']))) 221 | feat['train'] = torch.FloatTensor(features_train) 222 | del features, features_train 223 | else: 224 | feat['train'] = torch.FloatTensor(features[idx['train']]) 225 | del features 226 | gc.collect() 227 | 228 | print('train head ', idx['train'][:5]) 229 | print(feat['train'][:5, :].numpy()) 230 | print(f"n={n}, m={m}, F_t={feat['train'].size()}") 231 | print(f"n_train={idx['train'].size()}, n_val={idx['val'].size()}, n_test={idx['test'].size()}", flush=True) 232 | return feat, labels, idx 233 | -------------------------------------------------------------------------------- /Precompute/MyType.h: -------------------------------------------------------------------------------- 1 | 
/* 2 | Matrix algebra computation 3 | Author: nyLiao 4 | */ 5 | #ifndef SCARA_MYTYPE_H 6 | #define SCARA_MYTYPE_H 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "npy.hpp" 13 | #include "BasicDefinition.h" 14 | 15 | typedef Eigen::Matrix NodeVector; 16 | typedef Eigen::Matrix ScoreVector; 17 | typedef Eigen::Map ScoreVectorMap; 18 | typedef Eigen::Matrix ScoreMatrix; 19 | 20 | 21 | /* 22 | * Matrix in 1D vector. 23 | */ 24 | class My2DVector { 25 | private: 26 | FltVector data; 27 | NInt nrow = 0; 28 | NInt ncol = 0; 29 | 30 | friend class My2DVectorRow; 31 | 32 | class My2DVectorRow { 33 | private: 34 | My2DVector &parent; 35 | NInt row; 36 | public: 37 | My2DVectorRow(My2DVector &_parent, const NInt &_row) : 38 | parent(_parent), row(_row) {} 39 | 40 | ScoreFlt &operator[] (const NInt &col) { 41 | return parent.data[row * parent.ncol + col]; 42 | } 43 | 44 | const ScoreFlt &operator[] (const NInt &col) const { 45 | return parent.data[row * parent.ncol + col]; 46 | } 47 | 48 | FltVector::iterator begin() { 49 | return parent.data.begin() + row * parent.ncol; 50 | } 51 | 52 | FltVector::iterator end() { 53 | return parent.data.begin() + (row + 1) * parent.ncol; 54 | } 55 | 56 | void erase() { 57 | // parent.data.erase(parent.data.begin() + row * parent.ncol, 58 | // parent.data.begin() + (row + 1) * parent.ncol); 59 | parent.nrow--; 60 | parent.data.resize(parent.nrow * parent.ncol); 61 | // NOTE: shrink_to_fit() reduces stat memory but slows down conversion 62 | // parent.data.shrink_to_fit(); 63 | } 64 | }; 65 | 66 | public: 67 | explicit My2DVector() {} 68 | 69 | explicit My2DVector(const NInt &_nrows, const NInt &_ncols) : 70 | data(_nrows * _ncols), 71 | nrow(_nrows), 72 | ncol(_ncols) { 73 | } 74 | 75 | void allocate(const NInt &_nrows, const NInt &_ncols) { 76 | data.resize(_nrows * _ncols); 77 | nrow = _nrows; 78 | ncol = _ncols; 79 | } 80 | 81 | My2DVectorRow operator[] (const NInt &row) { 82 | return My2DVectorRow(*this, row); 83 | 
} 84 | 85 | inline const NInt size() const { return nrow * ncol; } 86 | 87 | inline const NInt nrows() const { return nrow; } 88 | 89 | inline const NInt ncols() const { return ncol; } 90 | 91 | inline FltVector& get_data() { return data; } 92 | 93 | inline const FltVector& get_data() const { return data; } 94 | 95 | inline bool is_empty() const { 96 | return data.empty(); 97 | } 98 | 99 | inline void set_ncol(const NInt &_ncol) { ncol = _ncol; } 100 | 101 | inline void set_nrow(const NInt &_nrow) { nrow = _nrow; } 102 | 103 | inline void clear() { 104 | data.clear(); 105 | nrow = 0; 106 | ncol = 0; 107 | } 108 | 109 | void emplace_row(FltVector::iterator begin, FltVector::iterator end) { 110 | data.insert(data.end(), begin, end); 111 | nrow++; 112 | } 113 | 114 | void load_npy(const std::string file_path) { 115 | std::vector shape; 116 | bool fortran_order; 117 | data.clear(); 118 | npy::LoadArrayFromNumpy(file_path, shape, fortran_order, data); 119 | assert(fortran_order == false && "ERROR: Array should be in C order."); 120 | nrow = shape[0]; // feature size F 121 | ncol = shape[1]; // node num Vt_num 122 | printf("V2D RSS RAM: %.3f GB\n", get_stat_memory()); 123 | cout<<"Input file: "< res_shape {{nrow, ncol}}; 128 | npy::SaveArrayAsNumpy(file_path, false, res_shape.size(), res_shape.data(), data); 129 | cout<<"Saved file: "< data; 143 | 144 | public: 145 | explicit MyMatrix(const NInt _tail = 0) : tail(_tail) {} 146 | 147 | explicit MyMatrix(const NInt _nrows, const NInt _ncols, const NInt _tail = 0) : 148 | nrow(_nrows), 149 | ncol(_ncols), 150 | tail(_tail), 151 | data(_nrows, FltVector(_ncols+_tail)) {} 152 | 153 | void allocate(const NInt &_nrows, const NInt &_ncols) { 154 | // data.resize(_nrows, FltVector(_ncols, 0)); 155 | nrow = _nrows; 156 | ncol = _ncols; 157 | data = std::vector(_nrows, FltVector(_ncols+tail)); 158 | } 159 | 160 | FltVector &operator[] (const NInt &row) { 161 | return data[row]; 162 | } 163 | 164 | const FltVector &operator[] (const 
NInt &row) const { 165 | return data[row]; 166 | } 167 | 168 | inline const NInt size() const { return nrow * ncol; } 169 | 170 | inline const NInt nrows() const { return nrow; } 171 | 172 | inline const NInt ncols() const { return ncol; } 173 | 174 | inline bool is_empty() const { 175 | return nrow == 0; 176 | } 177 | 178 | inline bool is_regular(const NInt &row) const { 179 | return data[row].size() == ncol+tail; 180 | } 181 | 182 | inline bool is_regular() const { 183 | for (NInt i = 0; i < nrow; ++i) 184 | if (!is_regular(i)) 185 | return false; 186 | return true; 187 | } 188 | 189 | inline void set_size(const NInt &_nrow, const NInt &_ncol) { 190 | nrow = _nrow; 191 | ncol = _ncol; 192 | } 193 | 194 | inline void set_ncol(const NInt &_ncol) { ncol = _ncol; } 195 | 196 | inline void set_nrow(const NInt &_nrow) { nrow = _nrow; } 197 | 198 | inline void set_col(const NInt &col, const FltVector &_data) { 199 | assert(col < ncol); 200 | assert(_data.size() == nrow); 201 | for (NInt i = 0; i < nrow; ++i) { 202 | data[i][col] = _data[i]; 203 | } 204 | } 205 | 206 | inline void copy_row(const NInt &row, const FltVector &_data) { 207 | data[row] = _data; 208 | data[row].resize(ncol+tail); 209 | } 210 | 211 | void copy_rows(const IntVector row_idx, const MyMatrix &_data) { 212 | /* 213 | * row_idx[i] = j means copy _data[j] to data[i] 214 | */ 215 | // assert(row_idx.size() == nrow); 216 | for (NInt i = 0; i < row_idx.size(); ++i) { 217 | copy_row(i, _data[row_idx[i]]); 218 | } 219 | } 220 | 221 | void swap_rows(const IntVector row_idx, MyMatrix &_data) { 222 | // assert(row_idx.size() == nrow); 223 | for (NInt i = 0; i < row_idx.size(); ++i) { 224 | data[i].swap(_data[row_idx[i]]); 225 | } 226 | } 227 | 228 | void from_V2D(My2DVector matv2d, const IntVector Vt_nodes) { 229 | data.resize(nrow); 230 | if (matv2d.ncols() == ncol) { 231 | for (long i = nrow-1; i >= 0; --i) { 232 | data[i] = FltVector(std::make_move_iterator(matv2d[i].begin()), 233 | 
std::make_move_iterator(matv2d[i].end()) ); 234 | // append `tail` elements at end to be in line with SpeedPPR::gstruct.means 235 | for (NInt j = 0; j < tail; ++j) 236 | data[i].emplace_back(0); 237 | matv2d[i].erase(); 238 | } 239 | } else { 240 | // populate Vt_nodes to all nodes 241 | assert(matv2d.ncols() == Vt_nodes.size()); 242 | NInt idx = 0; 243 | for (long i = nrow-1; i >= 0; --i) { 244 | data[i] = FltVector(ncol+tail, 0); 245 | for (NInt j = 0; j < ncol; ++j) { 246 | if (Vt_nodes[idx] == j) { 247 | data[i][j] = matv2d[i][idx]; 248 | idx++; 249 | } 250 | } 251 | matv2d[i].erase(); 252 | } 253 | } 254 | matv2d.clear(); 255 | // ! Still require O(2n) RAM as My2DVectorRow::erase does not reallocate 256 | printf("Mat RSS RAM: %.3f GB\n", get_stat_memory()); 257 | cout<<"Load size: "< 10 | #include 11 | #include 12 | #include "BasicDefinition.h" 13 | #include "MyType.h" 14 | 15 | 16 | inline ScoreMatrix shrink(const ScoreMatrix& X, const ScoreFlt tol) { 17 | const ScoreMatrix a_plus = X.array() + tol; 18 | const ScoreMatrix a_minus = X.array() - tol; 19 | return a_plus.cwiseMin(0) + a_minus.cwiseMax(0); 20 | } 21 | 22 | /* 23 | * Randomized SVD for fast approximate matrix decomposition 24 | * Interface is same as Eigen's jacobiSVD 25 | */ 26 | class RandomizedSvd { 27 | public: 28 | RandomizedSvd(const ScoreMatrix& m, const int rank, int oversamples = 0, int iter = 2) 29 | : U_(), V_(), S_(), rank_(rank) { 30 | ComputeRandomizedSvd(m, oversamples, iter); 31 | } 32 | 33 | ScoreVector singularValues() { return S_; } // shape: rank 34 | ScoreMatrix matrixU() { return U_; } // shape: m.rows * rank 35 | ScoreMatrix matrixV() { return V_; } // shape: m.cols * rank 36 | 37 | ScoreMatrix pinv() { // shape: m.cols * m.rows 38 | ScoreVector Sinv(S_.size()); 39 | for (int i = 0; i < S_.size(); ++i) { 40 | Sinv(i) = (S_(i) > 1e-6) ? 
(1.0/S_(i)) : 0.0; 41 | } 42 | // if ((Sinv.array() == 0).count() > 0) 43 | // std::cout << "Warning: SVD matrix is singular" << std::endl; 44 | return V_ * Sinv.asDiagonal() * U_.transpose(); 45 | } 46 | 47 | private: 48 | ScoreMatrix U_, V_; 49 | ScoreVector S_; 50 | int rank_; 51 | 52 | /* 53 | * Main function for randomized svd 54 | * oversamples: additional samples/rank for accuracy, to account for random sampling 55 | */ 56 | void ComputeRandomizedSvd(const ScoreMatrix& A, int oversamples, int iter) { 57 | // If matrix is too small for desired rank/oversamples 58 | if ((rank_ + oversamples) > std::min(A.rows(), A.cols())) { 59 | rank_ = std::min(A.rows(), A.cols()); 60 | oversamples = 0; 61 | } 62 | 63 | ScoreMatrix Q = FindRandomizedRange(A, rank_ + oversamples, iter); 64 | ScoreMatrix B = Q.transpose() * A; 65 | 66 | // Compute the SVD on the thin matrix (much cheaper than SVD on original) 67 | Eigen::JacobiSVD svd(B, Eigen::ComputeThinU | Eigen::ComputeThinV); 68 | 69 | U_ = (Q * svd.matrixU()).block(0, 0, A.rows(), rank_); 70 | V_ = svd.matrixV().block(0, 0, A.cols(), rank_); 71 | S_ = svd.singularValues().head(rank_); 72 | } 73 | 74 | /* 75 | Finds a set of orthonormal vectors that approximates the range of A 76 | Basic idea is that finding orthonormal basis vectors for A*W, where W is set of some 77 | random vectors w_i, can approximate the range of A 78 | Most of the time/computation in the randomized SVD is spent here 79 | */ 80 | ScoreMatrix FindRandomizedRange(const ScoreMatrix& A, int size, int iter) { 81 | int nr = A.rows(), nc = A.cols(); 82 | ScoreMatrix L(nr, size); 83 | Eigen::FullPivLU lu1(nr, size); 84 | ScoreMatrix Q = ScoreMatrix::Random(nc, size); 85 | Eigen::FullPivLU lu2(nc, nr); 86 | 87 | // Normalized power iterations. 
Simply multiplying by A repeatedly makes alg unstable, so use LU to "normalize" 88 | for (int i = 0; i < iter; ++i) { 89 | lu1.compute(A * Q); 90 | L.setIdentity(); 91 | L.block(0, 0, nr, size).triangularView() = 92 | lu1.matrixLU(); 93 | 94 | lu2.compute(A.transpose() * L); 95 | Q.setIdentity(); 96 | Q.block(0, 0, nc, size).triangularView() = 97 | lu2.matrixLU(); 98 | } 99 | 100 | Eigen::ColPivHouseholderQR qr(A * Q); 101 | return qr.householderQ() * ScoreMatrix::Identity(nr, size); // recover skinny Q matrix 102 | } 103 | }; 104 | 105 | /* 106 | * Implementation of Robust PCA algorithm via Principal Component Pursuit 107 | * Separates a matrix into two-components: low-rank and sparse 108 | */ 109 | class RobustPca { 110 | private: 111 | ScoreMatrix M, L, S; 112 | ScoreFlt nr, nc, spe_norm, fro_norm, l1_norm, errmin, mul; 113 | ScoreFlt TOL = 1e-3; 114 | bool trans; 115 | 116 | public: 117 | RobustPca(const ScoreMatrix _M, const ScoreFlt _mul = 4.0) : 118 | M(_M), mul((_mul > 4) ? _mul : 4.0) { 119 | trans = (M.rows() < M.cols()); 120 | if (trans) { 121 | cout<<"! Transposing matrix"<(); // coefficient-wise l1 norm 133 | cout<< " RPCA L1 norm: "<() << endl; 134 | errmin = TOL * fro_norm; 135 | } 136 | 137 | ScoreMatrix LowRankComponent() { return (trans? L.transpose() : L ); } 138 | 139 | ScoreMatrix SparseComponent() { return (trans? 
S.transpose() : S ); } 140 | 141 | // Encourages low-rank by taking (truncated) SVD, then setting small singular values to zero 142 | int svd_truncate(const ScoreMatrix& X, int rank, ScoreFlt min_sv, ScoreMatrix& L) { 143 | RandomizedSvd rsvd(X, rank, (int)ceil(0.2*rank)+1, 1); 144 | ScoreVector s = rsvd.singularValues(); 145 | ScoreVector s0 = s.cwiseAbs() - min_sv * ScoreVector::Ones(s.size()); 146 | int nnz = (s0.array() > 0).count(); 147 | // cout<<"S: "<()) * lambda; 168 | ScoreMatrix Z = M / init_scale; 169 | ScoreMatrix M2; 170 | 171 | const int sv_step = std::max(1, (int)ceil(rank / maxiter)); 172 | int sv = sv_step; 173 | 174 | for (int i = 0; i < maxiter; ++i) { 175 | // cout<<" rank: "<() << endl; 188 | if (err < errmin) { 189 | break; 190 | } 191 | 192 | Z += mu * Zi; 193 | mu = std::min(mu * rho, mu_bar); 194 | } 195 | // mu = k * nc * nr / (4 * l1_norm); 196 | // M2 = M + Z / mu; 197 | // svd_truncate(M2 - S, rank, 1/mu, L); 198 | // shrink(M2 - L, lambda/mu, S); 199 | } 200 | 201 | ScoreMatrix fit_fixed(const ScoreMatrix B, const int maxiter = 10, const ScoreFlt k = 1.0) { 202 | int rank = B.cols(); // B: Vs_num * base_size 203 | ScoreFlt lambda = mul * rank / (sqrt(nr + nc)); 204 | ScoreFlt mu = k * nc * nr / (16 * l1_norm); 205 | ScoreFlt mu_bar = mu / TOL; 206 | ScoreFlt rho = k * 1.5; 207 | // ScoreFlt init_scale = std::max(spe_norm, M.lpNorm()) * lambda; 208 | // ScoreMatrix Z = M / init_scale; 209 | ScoreMatrix Z = ScoreMatrix::Zero(nr, nc); 210 | ScoreMatrix M2, Theta; 211 | 212 | RandomizedSvd Bsvd(B, rank, 0, sqrt(rank)); 213 | ScoreMatrix Binv = Bsvd.pinv(); // Binv: base_size * Vs_num 214 | 215 | for (int i = 0; i < maxiter; ++i) { 216 | // cout<<" r: "< " << L.lpNorm<1>() << " " << S.lpNorm<1>() << " " << Theta.lpNorm<1>(); 228 | // cout << " err: "<() << endl; 229 | if (err < TOL * fro_norm) { 230 | break; 231 | } 232 | 233 | Z += mu * Zi; 234 | mu = std::min(mu * rho, mu_bar); 235 | } 236 | return Binv * (M - S); 237 | } 238 | }; 239 | 
240 | #endif // SCARA_FEATUREDECOMP_H 241 | -------------------------------------------------------------------------------- /demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Preparation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "n=19717, m=88648, F=500, C=3 | feat: (19717, 500), label: (19717,) | 60/600/19057=0.00/0.03/0.97\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "from data_processor import DataProcess\n", 25 | "# %load -r 323- Train/data_processor.py\n", 26 | "processor = DataProcess('pubmed', seed=0)\n", 27 | "processor.input(['adjtxt', 'attr_matrix', 'labels'])\n", 28 | "processor.calculate(['deg', 'idx_train', 'attr_matrix_norm'])\n", 29 | "processor.output(['deg', 'query', 'attr_matrix_norm'])\n", 30 | "print(processor)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "--------------------------------------------------------------------------------\n", 43 | "Configs:\n", 44 | "\tAlgorithm Parameter: clean_graph\n", 45 | "\tIs Undirected: 0\n", 46 | "\tInput Graph File: ../data/pubmed/adj.txt\n", 47 | "\tOutput Folder: ../data/pubmed\n", 48 | "--------------------------------------------------------------------------------\n", 49 | "88648 Lines Read.\n", 50 | "88648-th Non-Self Loop Edges.\n", 51 | "Finish Reading.\n", 52 | "--------------------------------------------------------------------------------\n", 53 | "Maximum ID: 19716\n", 54 | "Minimum ID: 0\n", 55 | "The number of dead end vertices: 0\n", 56 | "The number of Isolated Points: 0\n", 57 | "The maximum out degree is: 171\n", 58 | "Writing Binary 
Finished.\n", 59 | "--------------------------------------------------------------------------------\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "%%script bash\n", 65 | "DATASTR=pubmed\n", 66 | "DATADIR=../data/${DATASTR}\n", 67 | "../Precompute/build/featpush -algo clean_graph -is_undirected no \\\n", 68 | " -graph ${DATADIR}/adj.txt -output_folder ${DATADIR}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "# Feat-Push" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "--------------------------------------------------------------------------------\n", 88 | "Configs:\n", 89 | "\tAlgorithm Parameter: featpush\n", 90 | "\tInput Meta File: ../data/pubmed/attribute.txt\n", 91 | "\tInput Graph Binary File: ../data/pubmed/graph.bin\n", 92 | "\tInput Query File: ../data/pubmed/query.txt\n", 93 | "\tFeature File: ../data/pubmed/feats_norm.npy\n", 94 | "\tEstimation Folder: ../save/pubmed/featpush/0\n", 95 | "\tNumber of Split: 1\n", 96 | "\tRandom Seed: 0\n", 97 | "\tAlpha: 0.100000000000\n", 98 | "\tEpsilon: 2.000000000000\n", 99 | "--------------------------------------------------------------------------------\n", 100 | "The Number of Vertices: 19717\n", 101 | "The Number of Edges: 88648\n", 102 | "Returned Value of fread: 88648\n", 103 | "The number of dead end vertices:0\n", 104 | "edges_processed:88648\n", 105 | "--------------------------------------------------------------------------------\n", 106 | "Query size: 19717\n", 107 | "Input shape: 19717 500\n", 108 | "Feature size: 19717 500\n", 109 | "num_of_walks:10000000\n", 110 | "Result size: 9858500 \n", 111 | "Feature saved: ../save/pubmed/featpush/0/score_0.1_2.npy\n", 112 | "Mem: 569 MB\n", 113 | "Total Time: 1.781927, Average: 0.003563853993\n", 114 | 
"--------------------------------------------------------------------------------\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "%%script bash\n", 120 | "DATASTR=pubmed\n", 121 | "ALGOSTR=featpush\n", 122 | "SEED=0\n", 123 | "DATADIR=../data/${DATASTR}\n", 124 | "SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED}\n", 125 | "mkdir -p ${SAVEDIR}\n", 126 | "../Precompute/build/featpush -algo ${ALGOSTR} \\\n", 127 | " -meta ${DATADIR}/attribute.txt -graph_binary ${DATADIR}/graph.bin \\\n", 128 | " -query ${DATADIR}/query.txt -feature_file ${DATADIR}/feats_norm.npy \\\n", 129 | " -estimation_folder ${SAVEDIR} -split_num 1 -seed ${SEED} \\\n", 130 | " -alpha 0.1 -epsilon 2" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "# Feat-Reuse" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "--------------------------------------------------------------------------------\n", 150 | "Configs:\n", 151 | "\tAlgorithm Parameter: featreuse\n", 152 | "\tInput Meta File: ../data/pubmed/attribute.txt\n", 153 | "\tInput Graph Binary File: ../data/pubmed/graph.bin\n", 154 | "\tInput Query File: ../data/pubmed/query.txt\n", 155 | "\tFeature File: ../data/pubmed/feats_norm.npy\n", 156 | "\tEstimation Folder: ../save/pubmed/featreuse/0\n", 157 | "\tNumber of Split: 1\n", 158 | "\tRandom Seed: 0\n", 159 | "\tAlpha: 0.100000000000\n", 160 | "\tEpsilon: 2.000000000000\n", 161 | "--------------------------------------------------------------------------------\n", 162 | "The Number of Vertices: 19717\n", 163 | "The Number of Edges: 88648\n", 164 | "Returned Value of fread: 88648\n", 165 | "The number of dead end vertices:0\n", 166 | "edges_processed:88648\n", 167 | "--------------------------------------------------------------------------------\n", 168 | "Query size: 19717\n", 169 | 
"Input shape: 19717 500\n", 170 | "Feature size: 19717 500\n", 171 | "num_of_walks:10000000\n", 172 | "Result size: 9858500 \n", 173 | "base_size:20\n", 174 | "Time Used on Base 0.138076\n", 175 | "Feature saved: ../save/pubmed/featreuse/0/score_0.1_2.npy\n", 176 | "avg_tht:0.0141102\n", 177 | "avg_res:0.995987\n", 178 | "re_feat_num:490\n", 179 | "Mem: 570 MB\n", 180 | "Total Time: 1.096374, Average: 0.002192748003\n", 181 | "--------------------------------------------------------------------------------\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "%%script bash\n", 187 | "DATASTR=pubmed\n", 188 | "ALGOSTR=featreuse\n", 189 | "SEED=0\n", 190 | "DATADIR=../data/${DATASTR}\n", 191 | "SAVEDIR=../save/${DATASTR}/${ALGOSTR}/${SEED}\n", 192 | "mkdir -p ${SAVEDIR}\n", 193 | "../Precompute/build/featpush -algo ${ALGOSTR} \\\n", 194 | " -meta ${DATADIR}/attribute.txt -graph_binary ${DATADIR}/graph.bin \\\n", 195 | " -query ${DATADIR}/query.txt -feature_file ${DATADIR}/feats_norm.npy \\\n", 196 | " -estimation_folder ${SAVEDIR} -split_num 1 -seed ${SEED} \\\n", 197 | " -alpha 0.1 -epsilon 2" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "# Train" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "--------------------\n", 217 | "Option saved.\n", 218 | "Config path: ../save/pubmed/featreuse/0/config.json\n", 219 | "Option dict: {'seed': 0, 'config': 'config/pubmed.json', 'dev': 0, 'data': 'pubmed', 'path': '../data/', 'algo': 'featreuse', 'epochs': 1000, 'patience': 100, 'batch': 64, 'lr': 0.005, 'weight_decay': 0.0001, 'layer': 2, 'hidden': 128, 'dropout': 0.5, 'bias': 'none', 'alpha': 0.1, 'eps': 2, 'rrz': 0.5, 'spt': 1}\n", 220 | "\n", 221 | "--------------------\n", 222 | "n=19717, m=88648, F_t=torch.Size([60, 500])\n", 223 | "n_train=torch.Size([60]), 
n_val=torch.Size([600]), n_test=torch.Size([19057])\n", 224 | "--------------------\n", 225 | "Epoch:0009 | train loss:0.3550, val acc:0.7867, cost:0.0168\n", 226 | "Epoch:0019 | train loss:0.1621, val acc:0.7783, cost:0.0286\n", 227 | "Epoch:0029 | train loss:0.0634, val acc:0.7783, cost:0.0403\n", 228 | "Epoch:0039 | train loss:0.1016, val acc:0.7850, cost:0.0520\n", 229 | "Epoch:0049 | train loss:0.0944, val acc:0.7733, cost:0.0638\n", 230 | "Epoch:0059 | train loss:0.0149, val acc:0.7850, cost:0.0755\n", 231 | "Epoch:0069 | train loss:0.0194, val acc:0.7933, cost:0.0873\n", 232 | "Epoch:0079 | train loss:0.0054, val acc:0.7950, cost:0.0993\n", 233 | "Epoch:0089 | train loss:0.0860, val acc:0.7900, cost:0.1111\n", 234 | "Epoch:0099 | train loss:0.0131, val acc:0.7983, cost:0.1229\n", 235 | "Epoch:0109 | train loss:0.0105, val acc:0.7967, cost:0.1347\n", 236 | "Epoch:0119 | train loss:0.0113, val acc:0.7950, cost:0.1464\n", 237 | "Epoch:0129 | train loss:0.0443, val acc:0.7967, cost:0.1587\n", 238 | "Epoch:0139 | train loss:0.1653, val acc:0.7983, cost:0.1704\n", 239 | "Epoch:0149 | train loss:0.0840, val acc:0.7967, cost:0.1824\n", 240 | "Epoch:0159 | train loss:0.0399, val acc:0.7967, cost:0.1942\n", 241 | "Epoch:0169 | train loss:0.0082, val acc:0.7950, cost:0.2061\n", 242 | "Train time cost: 0.2145\n", 243 | "Train best acc: 1.0000, Val best acc: 0.7983\n", 244 | "--------------------\n", 245 | "Test time cost: 0.3500, Memory: 5.080GB\n", 246 | "Best epoch: 76, Test acc: 0.7782\n", 247 | "--------------------\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "!python -u run.py -f 0 -c config/pubmed.json -v 0" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3.9.12 ('torch19-py9')", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | 
"codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.9.12" 280 | }, 281 | "orig_nbformat": 4, 282 | "vscode": { 283 | "interpreter": { 284 | "hash": "d3845a1a74aefdaf9f167c78e35335e49a8028d27c0f04387a347581b7d851dd" 285 | } 286 | } 287 | }, 288 | "nbformat": 4, 289 | "nbformat_minor": 2 290 | } 291 | -------------------------------------------------------------------------------- /Precompute/fastPRNG.h: -------------------------------------------------------------------------------- 1 | //------------------------------------------------------------------------------ 2 | // Copyright (c) 2018-2020 Michele Morrone 3 | // All rights reserved. 4 | // 5 | // https://michelemorrone.eu - https://BrutPitt.com 6 | // 7 | // twitter: https://twitter.com/BrutPitt - github: https://github.com/BrutPitt/fastPRNG 8 | // 9 | // mailto:brutpitt@gmail.com - mailto:me@michelemorrone.eu 10 | // 11 | // This software is distributed under the terms of the BSD 2-Clause license 12 | //------------------------------------------------------------------------------ 13 | #pragma once 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | namespace fastPRNG { 21 | #define UNI_32BIT_INV 2.3283064365386962890625e-10 22 | #define VNI_32BIT_INV 4.6566128730773925781250e-10 // UNI_32BIT_INV * 2 23 | 24 | #define UNI_64BIT_INV 5.42101086242752217003726400434970e-20 25 | #define VNI_64BIT_INV 1.08420217248550443400745280086994e-19 // UNI_64BIT_INV * 2 26 | 27 | #define FPRNG_SEED_INIT64 std::chrono::system_clock::now().time_since_epoch().count() 28 | #define FPRNG_SEED_INIT32 FPRNG_SEED_INIT64 29 | 30 | inline static uint32_t splitMix32(const uint32_t val) { 31 | uint32_t z = val + 0x9e3779b9; 32 | z ^= z >> 15; // 16 for murmur3 33 | z *= 0x85ebca6b; 34 | z ^= z >> 13; 35 | z *= 
0xc2b2ae35; 36 | return z ^ (z >> 16); 37 | } 38 | 39 | inline static uint64_t splitMix64(const uint64_t val) { 40 | uint64_t z = val + 0x9e3779b97f4a7c15; 41 | z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; 42 | z = (z ^ (z >> 27)) * 0x94d049bb133111eb; 43 | return z ^ (z >> 31); 44 | } 45 | 46 | // 32/64 bit rotation func 47 | template inline static T rotl(const T x, const int k) { return (x << k) | (x >> (sizeof(T)*8 - k)); } // sizeof*8 is resolved to compile-time 48 | 49 | /*-------------------------------------------------------------------------- 50 | 64bit PRNG Algorithms: xoshiro / xoroshiro 51 | 52 | xoshiro256+ / xoshiro256++ / xoshiro256** 53 | xoroshiro128+ / xoroshiro128++ / xoroshiro128** 54 | 55 | Algorithms by David Blackman and Sebastiano Vigna 56 | http://prng.di.unimi.it/ 57 | 58 | To the extent possible under law, the author has dedicated all copyright 59 | and related and neighboring rights to this software to the public domain 60 | worldwide. This software is distributed without any warranty. 61 | 62 | See . 
63 | -------------------------------------------------------------------------- */ 64 | #define XOSHIRO256\ 65 | const uint64_t t = s1 << 17;\ 66 | s2 ^= s0;\ 67 | s3 ^= s1;\ 68 | s1 ^= s2;\ 69 | s0 ^= s3;\ 70 | s2 ^= t;\ 71 | s3 = rotl(s3, 45);\ 72 | return result; 73 | 74 | #define XOROSHIRO128(A,B,C)\ 75 | s1 ^= s0;\ 76 | s0 = rotl(s0, A) ^ s1 ^ (s1 << B);\ 77 | s1 = rotl(s1, C);\ 78 | return result; 79 | 80 | #define XORSHIFT64\ 81 | s0 ^= s0 << 13;\ 82 | s0 ^= s0 >> 7;\ 83 | s0 ^= s0 << 17;\ 84 | return s0; 85 | 86 | #define XOSHIRO256_STATIC(FUNC)\ 87 | static const uint64_t seed = uint64_t(FPRNG_SEED_INIT64);\ 88 | static uint64_t s0 = splitMix64(seed), s1 = splitMix64(s0), s2 = splitMix64(s1), s3 = splitMix64(s2);\ 89 | FUNC; XOSHIRO256 90 | 91 | #define XOROSHIRO128_STATIC(FUNC, A, B, C)\ 92 | static const uint64_t seed = uint64_t(FPRNG_SEED_INIT64);\ 93 | static uint64_t s0 = splitMix64(seed), s1 = splitMix64(s0);\ 94 | FUNC; XOROSHIRO128(A,B,C) 95 | 96 | #define XORSHIFT64_STATIC\ 97 | static uint64_t s0 = uint64_t(FPRNG_SEED_INIT64);\ 98 | XORSHIFT64 99 | 100 | // 101 | // 64bit pseudo-random generator 102 | // All integer values are returned in interval [0, UINT64_MAX] 103 | // to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t 104 | /////////////////////////////////////////////////////////////////////////////// 105 | class fastXS64 106 | { 107 | public: 108 | fastXS64(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) { seed(seedVal); } 109 | 110 | inline uint64_t xoshiro256p() { return xoshiro256(s0 + s3); } 111 | inline uint64_t xoshiro256pp() { return xoshiro256(rotl(s0 + s3, 23) + s0); } 112 | inline uint64_t xoshiro256xx() { return xoshiro256(rotl(s1 * 5, 7) * 9); } 113 | 114 | template inline T xoshiro256p_UNI() { return T( xoshiro256p()) * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 115 | template inline T xoshiro256p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI 
returns value in [-1, 1] with T ==> float/double 116 | template inline T xoshiro256p_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 117 | { return min + (max-min) * xoshiro256p_UNI(); } 118 | 119 | inline uint64_t xoroshiro128p() { return xoroshiro128( s0 + s1); } 120 | inline uint64_t xoroshiro128pp() { return xoroshiro128(rotl(s0 + s1, 17) + s0, 49, 21, 28); } 121 | inline uint64_t xoroshiro128xx() { return xoroshiro128(rotl(s0 * 5, 7) * 9); } 122 | 123 | template inline T xoroshiro128p_UNI() { return T( xoshiro256p()) * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 124 | template inline T xoroshiro128p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 125 | template inline T xoroshiro128p_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 126 | { return min + (max-min) * xoroshiro128p_UNI(); } 127 | 128 | inline uint64_t xorShift() { XORSHIFT64 } // Marsaglia xorShift: period 2^64-1 129 | 130 | template inline T xorShift_UNI() { return xorShift() * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 131 | template inline T xorShift_VNI() { return int64_t(xorShift()) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 132 | template inline T xorShift_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 133 | { return min + (max-min) * xorShift_UNI(); } 134 | 135 | void seed(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) { 136 | s0 = splitMix64(seedVal); 137 | s1 = splitMix64(s0); 138 | s2 = splitMix64(s1); 139 | s3 = splitMix64(s2); 140 | } 141 | private: 142 | inline uint64_t xoshiro256(const uint64_t result) { XOSHIRO256 } 143 | inline uint64_t xoroshiro128(const uint64_t result, const int A = 24, const int B = 16, const int C = 37) { XOROSHIRO128(A,B,C) } 144 | 145 | uint64_t s0, s1, s2, s3; 146 | }; 147 | 148 | // 
fastXS64s - static members 149 | // you can call directly w/o declaration, but.. 150 | // N.B. all members/functions share same seed, and subsequents xor & shift 151 | // operations on it, if you need different seeds declare more 152 | // fastXS32 (non static) objects 153 | // 154 | // 64bit pseudo-random generator 155 | // All integer values are returned in interval [0, UINT64_MAX] 156 | // to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t 157 | /////////////////////////////////////////////////////////////////////////////// 158 | class fastXS64s 159 | { 160 | public: 161 | fastXS64s() = default; 162 | 163 | inline static uint64_t xoshiro256p() { XOSHIRO256_STATIC(const uint64_t result = s0 + s3) } 164 | inline static uint64_t xoshiro256pp() { XOSHIRO256_STATIC(const uint64_t result = rotl(s0 + s3, 23) + s0) } 165 | inline static uint64_t xoshiro256xx() { XOSHIRO256_STATIC(const uint64_t result = rotl(s1 * 5, 7) * 9) } 166 | 167 | template inline static T xoshiro256p_UNI() { return T( xoshiro256p()) * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 168 | template inline static T xoshiro256p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 169 | template inline static T xoshiro256p_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 170 | { return min + (max-min) * xoshiro256p_UNI(); } 171 | 172 | inline static uint64_t xoroshiro128p() { XOROSHIRO128_STATIC(const uint64_t result = s0 + s1, 24, 13, 27) } 173 | inline static uint64_t xoroshiro128pp() { XOROSHIRO128_STATIC(const uint64_t result = rotl(s0 + s1, 17) + s0, 49, 21, 28) } 174 | inline static uint64_t xoroshiro128xx() { XOROSHIRO128_STATIC(const uint64_t result = rotl(s0 * 5, 7) * 9, 24, 13, 27) } 175 | 176 | template inline static T xoroshiro128p_UNI() { return T( xoshiro256p()) * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 177 | 
template inline static T xoroshiro128p_VNI() { return T(int64_t(xoshiro256p())) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 178 | template inline static T xoroshiro128p_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 179 | { return min + (max-min) * xoroshiro128p_UNI(); } 180 | 181 | inline static uint64_t xorShift() { XORSHIFT64_STATIC } // Marsaglia xorShift: period 2^64-1 182 | 183 | template inline static T xorShift_UNI() { return xorShift() * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 184 | template inline static T xorShift_VNI() { return int64_t(xorShift()) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 185 | template inline static T xorShift_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 186 | { return min + (max-min) * xorShift_UNI(); } 187 | }; 188 | 189 | #undef XOSHIRO256 190 | #undef XOROSHIRO128 191 | #undef XORSHIFT64 192 | #undef XOSHIRO256_STATIC 193 | #undef XOROSHIRO128_STATIC 194 | #undef XORSHIFT64_STATIC 195 | 196 | /*-------------------------------------------------------------------------- 197 | 64bit PRNG Algorithms: 198 | 199 | znew / wnew / MWC / CNG / FIB / XSH / KISS 200 | 201 | Originally written from George Marsaglia 202 | -------------------------------------------------------------------------- */ 203 | 204 | 205 | // fastRandom64Class 206 | // 207 | // 64bit pseudo-random generator 208 | // All values are returned in interval [0, UINT64_MAX] 209 | // to get values between [INT64_MIN, INT64_MAX] just cast result to int64_t 210 | /////////////////////////////////////////////////////////////////////////////// 211 | class fastRandom64Class 212 | { 213 | public: 214 | // no vaule, seed from system clock, or same seed for same sequence of numbers 215 | fastRandom64Class(const uint64_t seedVal = uint64_t(FPRNG_SEED_INIT64)) { reset(); seed(seedVal); } 216 | 217 | // re-seed the 
current state/values with a new random values 218 | void seed(const uint64_t seed = uint64_t(FPRNG_SEED_INIT64)) { 219 | uint64_t s[6]; 220 | s[0] = splitMix64(seed); 221 | for(int i=1; i<6; i++) s[i] = splitMix64(s[i-1]); 222 | initialize(s); 223 | } 224 | // reset to initial state 225 | void reset() { 226 | x=uint64_t(1234567890987654321ULL); c=uint64_t(123456123456123456ULL); 227 | y=uint64_t(362436362436362436ULL ); z=uint64_t(1066149217761810ULL ); 228 | a=uint64_t(224466889); b=uint64_t(7584631); 229 | } 230 | 231 | inline uint64_t MWC() { uint64_t t; return t=(x<<58)+c, c=(x>>6), x+=t, c+=(x>17), y^=(y<<43); } 234 | inline uint64_t FIB() { return (b=a+b),(a=b-a); } 235 | 236 | inline uint64_t KISS () { return MWC()+XSH()+CNG(); } //period 2^250 237 | 238 | template inline T KISS_UNI() { return KISS() * UNI_64BIT_INV; } // _UNI returns value in [ 0, 1] with T ==> float/double 239 | template inline T KISS_VNI() { return int64_t(KISS()) * VNI_64BIT_INV; } // _VNI returns value in [-1, 1] with T ==> float/double 240 | template inline T KISS_Range(T min, T max) // _Range returns value in [min, max] with T ==> float/double 241 | { return min + (max-min) * KISS_UNI(); } 242 | 243 | private: 244 | void initialize(const uint64_t *i){ x+=i[0]; y+=i[1]; z+=i[2]; c+=i[3]; a=+i[4]; b=+i[5]; } 245 | 246 | uint64_t x, c, y, z; 247 | uint64_t a, b; 248 | }; 249 | 250 | } // end of namespace FstRnd 251 | 252 | #undef UNI_32BIT_INV 253 | #undef VNI_32BIT_INV 254 | #undef UNI_64BIT_INV 255 | #undef VNI_64BIT_INV 256 | #undef FPRNG_SEED_INIT32 257 | #undef FPRNG_SEED_INIT64 258 | -------------------------------------------------------------------------------- /Train/data_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import scipy.sparse as sp 4 | from sklearn.preprocessing import StandardScaler 5 | from sklearn.model_selection import train_test_split 6 | 7 | 8 | NTRAIN_PER_CLASS = 20 9 | 
NVAL_PER_CLASS = NTRAIN_PER_CLASS * 10 10 | 11 | 12 | # ==================== 13 | def diag_sp(diag): 14 | """Diagonal array to scipy sparse diagonal matrix""" 15 | n = len(diag) 16 | return sp.dia_matrix((diag, [0]), shape=(n, n)) 17 | 18 | 19 | def matstd(m, with_mean=False): 20 | """Matrix standardization""" 21 | scaler = StandardScaler(with_mean=with_mean) 22 | m = scaler.fit_transform(m) 23 | return m 24 | 25 | 26 | def matnorm_inf_dual(m, axis=0): 27 | """Normalization of matrix, set positive/negative sum of column to 1 28 | """ 29 | pos = m.clip(min=0) 30 | possum = pos.sum(axis=axis) 31 | possum[possum == 0] = 1 # Avoid sum = 0 32 | pos = pos / possum 33 | 34 | neg = m.clip(max=0) 35 | negsum = - neg.sum(axis=axis) 36 | negsum[negsum == 0] = 1 # Avoid sum = 0 37 | neg = neg / negsum 38 | return (pos + neg) 39 | 40 | 41 | def split_random(seed, n, n_train, n_val): 42 | """Split index randomly""" 43 | np.random.seed(seed) 44 | rnd = np.random.permutation(n) 45 | 46 | train_idx = np.sort(rnd[:n_train]) 47 | val_idx = np.sort(rnd[n_train:n_train + n_val]) 48 | 49 | train_val_idx = np.concatenate((train_idx, val_idx)) 50 | test_idx = np.sort(np.setdiff1d(np.arange(n), train_val_idx)) 51 | return train_idx, val_idx, test_idx 52 | 53 | 54 | def split_label(seed, n, n_train_per_class, n_val, labels): 55 | """Split index with equal label in train set""" 56 | np.random.seed(seed) 57 | rnd = set(np.arange(n)) 58 | 59 | train_idx = np.array([], dtype=np.int) 60 | if labels.ndim == 1: 61 | lb_nonnan = labels[~np.isnan(labels)] 62 | nclass = int(lb_nonnan.max()) + 1 63 | for i in range(nclass): 64 | cdd = np.where(labels == i)[0] 65 | sz = min(n_train_per_class, len(cdd)) 66 | idxi = np.random.choice(cdd, size=sz, replace=False) 67 | train_idx = np.concatenate((train_idx, idxi)) 68 | else: 69 | nclass = labels.shape[1] 70 | for i in range(nclass): 71 | cdd = np.where(labels[:, i] > 0)[0] 72 | sz = min(n_train_per_class, len(cdd)) 73 | idxi = np.random.choice(cdd, size=sz, 
replace=False) 74 | train_idx = np.concatenate((train_idx, idxi)) 75 | 76 | train_idx = np.unique(train_idx.flatten(), axis=0) 77 | val_idx = np.array((list( rnd - set(train_idx) ))) 78 | val_idx = np.random.choice(val_idx, size=n_val, replace=False) 79 | val_idx = np.sort(val_idx) 80 | 81 | train_val_idx = np.concatenate((train_idx, val_idx)) 82 | test_idx = np.sort(np.setdiff1d(np.arange(n), train_val_idx)) 83 | return train_idx, val_idx, test_idx 84 | 85 | 86 | def split_stratify(seed, n, n_train, n_val, labels, idx=None): 87 | assert labels.ndim == 1, 'Only support 1D labels' 88 | if idx is None: 89 | idx = np.arange(n) 90 | train_idx, test_idx = train_test_split(idx, train_size=n_train, random_state=seed, stratify=labels) 91 | val_idx, test_idx = train_test_split(test_idx, train_size=n_val, random_state=seed, stratify=labels[test_idx]) 92 | return train_idx, val_idx, test_idx 93 | 94 | 95 | # ==================== 96 | class DataProcess(object): 97 | def __init__(self, name, path='../data/', rrz=0.5, seed=0) -> None: 98 | super().__init__() 99 | self.name = name 100 | self.path = path 101 | self.rrz = rrz 102 | self.seed = seed 103 | 104 | self._n = None 105 | self._m = None 106 | self._nfeat = None 107 | self._nclass = None 108 | 109 | self.adjnpz_path = self._get_path('adj.npz') 110 | self.adjtxt_path = self._get_path('adj.txt') 111 | self.degree_path = self._get_path('degree.npz') 112 | self.labels_path = self._get_path('labels.npz') 113 | self.query_path = self._get_path('query.txt') 114 | self.querytrain_path = self._get_path('query_train.txt') 115 | self.feats_path = self._get_path('feats.npy') 116 | self.featsnorm_path = self._get_path('feats_normt.npy') 117 | 118 | self.adj_matrix = None 119 | self.deg = None 120 | self.labels = None # Labels can be 1D array or 2D one hot 121 | self.idx_train = None 122 | self.idx_val = None 123 | self.idx_test = None 124 | self.attr_matrix = None 125 | self.attr_matrix_norm = None 126 | 127 | def _get_path(self, 
fname): 128 | return os.path.join(self.path, self.name, fname) 129 | 130 | @property 131 | def n(self): 132 | if self._n: 133 | return self._n 134 | if self.labels is None: 135 | self.input(['labels']) 136 | self._n = len(self.labels) # Len return shape[0] 137 | return self._n 138 | 139 | @property 140 | def n_train(self): 141 | return len(self.idx_train) 142 | 143 | @property 144 | def n_val(self): 145 | return len(self.idx_val) 146 | 147 | @property 148 | def n_test(self): 149 | return len(self.idx_test) 150 | 151 | @property 152 | def m(self): 153 | if self._m: 154 | return self._m 155 | # 1: use cache adj matrix 156 | if self.adj_matrix is not None: 157 | self._m = len(self.adj_matrix.data) 158 | # 2: use attribute file 159 | elif os.path.isfile(self._get_path('attribute.txt')): 160 | with open(self._get_path('attribute.txt'), 'r') as attr_f: 161 | nline = attr_f.readline().rstrip() 162 | mline = attr_f.readline().rstrip() 163 | self._m = int(''.join(filter(str.isdigit, mline))) 164 | # 3: count by wc -l 165 | else: 166 | import subprocess 167 | self._m = int(subprocess.check_output(["wc", "-l", self.adjtxt_path]).split()[0]) 168 | return self._m 169 | 170 | @property 171 | def nfeat(self): 172 | if self._nfeat: 173 | return self._nfeat 174 | if self.attr_matrix is None: 175 | self.input(['attr_matrix']) 176 | self._nfeat = self.attr_matrix.shape[1] 177 | return self._nfeat 178 | 179 | @property 180 | def nclass(self): 181 | if self._nclass: 182 | return self._nclass 183 | if self.labels is None: 184 | self.input(['labels']) 185 | # 1D array 186 | if self.labels.ndim == 1: 187 | # self._nclass = int(self.labels.max()) + 1 188 | lb_nonnan = self.labels[~np.isnan(self.labels)] 189 | self._nclass = int(lb_nonnan.max()) + 1 190 | # 2D one hot 191 | else: 192 | self._nclass = self.labels.shape[1] 193 | return self._nclass 194 | 195 | def __str__(self): 196 | s = f"n={self.n}, m={self.m}, F={self.nfeat}, C={self.nclass} | " 197 | s += f"feat: 
{self.attr_matrix.shape}, label: {self.labels.shape} | " 198 | s += f"{self.n_train}/{self.n_val}/{self.n_test}=" 199 | s += f"{self.n_train/self.n:0.2f}/{self.n_val/self.n:0.2f}/{self.n_test/self.n:0.2f}" 200 | return s 201 | 202 | def calculate(self, lst): 203 | for key in lst: 204 | if key == 'deg': 205 | assert self.adj_matrix is not None 206 | self.deg = self.adj_matrix.sum(1).A1 207 | elif key in ['idx_train', 'idx_val', 'idx_test']: 208 | n_train = NTRAIN_PER_CLASS * self.nclass 209 | n_val = NVAL_PER_CLASS * self.nclass 210 | if 'paper' in self.name: 211 | np.random.seed(self.seed) 212 | self.input(['idx_train', 'idx_val', 'idx_test', 'labels']) 213 | 214 | # rnd = np.concatenate((self.idx_train, self.idx_val, self.idx_test)) 215 | # rnd = np.random.permutation(rnd) 216 | # self.idx_train = np.sort(rnd[:n_train]) 217 | # self.idx_val = np.sort(rnd[n_train:n_train + n_val]) 218 | # self.idx_test = np.sort(rnd[n_train + n_val:]) 219 | 220 | idx_all = np.concatenate((self.idx_train, self.idx_val, self.idx_test)) 221 | self.idx_train, self.idx_val, self.idx_test = split_stratify(self.seed, len(idx_all), self.n_train, self.n_val, self.labels[idx_all]) 222 | elif 'mag' in self.name: 223 | self.idx_train, self.idx_val, self.idx_test = split_label(self.seed, self.n, NTRAIN_PER_CLASS * 50, n_val // 4, self.labels) 224 | # self.idx_train, self.idx_val, self.idx_test = split_stratify(self.seed, self.n, n_train * 5, n_val, self.labels) 225 | else: 226 | # self.idx_train, self.idx_val, self.idx_test = split_random(self.seed, self.n, n_train, n_val) 227 | # self.idx_train, self.idx_val, self.idx_test = split_label(self.seed, self.n, NTRAIN_PER_CLASS, n_val, self.labels) 228 | self.idx_train, self.idx_val, self.idx_test = split_stratify(self.seed, self.n, n_train, n_val, self.labels) 229 | elif key == 'labels_oh': 230 | if self.labels.ndim == 2: 231 | self.labels_oh = self.labels 232 | else: 233 | self.labels_oh = np.zeros((self.n, self.nclass), dtype=np.int8) 234 | idx = 
~ np.isnan(self.labels) 235 | row = np.arange(self.labels.size) 236 | self.labels_oh[row[idx], self.labels[idx]] = 1 237 | elif key == 'role': 238 | self.role = {} 239 | self.role['tr'] = self.idx_train.tolist() 240 | self.role['va'] = self.idx_val.tolist() 241 | self.role['te'] = self.idx_test.tolist() 242 | elif key == 'attr_matrix_norm': 243 | assert self.attr_matrix is not None 244 | assert self.deg is not None 245 | deg_pow = np.power(np.maximum(self.deg, 1e-12), 1 - self.rrz) 246 | deg_pow = diag_sp(deg_pow).astype(np.float32) 247 | self.attr_matrix_norm = deg_pow @ matstd(self.attr_matrix) # [n, F] 248 | self.attr_matrix_norm = matnorm_inf_dual(self.attr_matrix_norm).astype(np.float32) 249 | self.attr_matrix_norm = self.attr_matrix_norm.transpose().astype(np.float32, order='C') # [F, n] 250 | else: 251 | print("Key not exist: {}".format(key)) 252 | 253 | def input(self, lst): 254 | for key in lst: 255 | if key == 'adjnpz': 256 | self.adj_matrix = sp.load_npz(self.adjnpz_path) 257 | elif key == 'adjtxt': 258 | with open(self.adjtxt_path, 'r') as attr_f: 259 | nline = attr_f.readline().rstrip() 260 | self._n = int(''.join(filter(str.isdigit, nline))) 261 | adjtxt = np.loadtxt(self.adjtxt_path) 262 | self._m = adjtxt.shape[0] 263 | ones = np.ones((self.m), dtype=np.int8) 264 | self.adj_matrix = sp.coo_matrix( 265 | (ones, (adjtxt[:, 0], adjtxt[:, 1])), 266 | shape=(self.n, self.n)) 267 | self.adj_matrix = self.adj_matrix.tocsr() 268 | elif key == 'deg': 269 | self.deg = dict(np.load(self.degree_path))['arr_0'] 270 | elif key == 'labels': 271 | self.labels = dict(np.load(self.labels_path, allow_pickle=True))['labels'] 272 | elif key == 'idx_train': 273 | self.idx_train = dict(np.load(self.labels_path, allow_pickle=True))['idx_train'] 274 | elif key == 'idx_val': 275 | self.idx_val = dict(np.load(self.labels_path, allow_pickle=True))['idx_val'] 276 | elif key == 'idx_test': 277 | self.idx_test = dict(np.load(self.labels_path, allow_pickle=True))['idx_test'] 278 | 
elif key == 'attr_matrix': 279 | self.attr_matrix = np.load(self.feats_path) 280 | elif key == 'attr_matrix_norm': 281 | self.attr_matrix_norm = np.load(self.featsnorm_path) 282 | else: 283 | print("Key not exist: {}".format(key)) 284 | 285 | def output(self, lst): 286 | for key in lst: 287 | if key == 'adjnpz': 288 | self.adj_matrix = self.adj_matrix.tocsr() 289 | assert sp.isspmatrix_csr(self.adj_matrix) 290 | sp.save_npz(self.adjnpz_path, self.adj_matrix) 291 | elif key == 'adjtxt': 292 | self.adj_matrix = self.adj_matrix.tocoo() 293 | with open(self.adjtxt_path, 'w') as f: 294 | f.write("# {:d}\n".format(self.n)) 295 | for i in range(self.m): 296 | f.write("{:d} {:d}\n".format(self.adj_matrix.row[i], self.adj_matrix.col[i])) 297 | self.adj_matrix = self.adj_matrix.tocsr() 298 | elif key == 'deg': 299 | np.savez_compressed(self.degree_path, self.deg) 300 | elif key in ['labels', 'idx_train', 'idx_val', 'idx_test']: 301 | labels_dict = {'labels': self.labels, 302 | 'idx_train': self.idx_train, 303 | 'idx_val': self.idx_val, 304 | 'idx_test': self.idx_test} 305 | np.savez_compressed(self.labels_path, **labels_dict) 306 | elif key == 'query': 307 | query = np.arange(self.n, dtype=int) 308 | np.savetxt(self.query_path, query, fmt='%d', delimiter='\n') 309 | elif key == 'query_train': 310 | assert self.idx_train is not None 311 | np.savetxt(self.querytrain_path, self.idx_train, fmt='%d', delimiter='\n') 312 | elif key == 'attr_matrix': 313 | self.attr_matrix = self.attr_matrix.astype(np.float32, order='C') 314 | np.save(self.feats_path, self.attr_matrix) 315 | elif key == 'attr_matrix_norm': 316 | self.attr_matrix_norm = self.attr_matrix_norm.astype(np.float32, order='C') 317 | np.save(self.featsnorm_path, self.attr_matrix_norm) 318 | else: 319 | print("Key not exist: {}".format(key)) 320 | 321 | def output_split(self, attr_matrix, spt=10, name='feats'): 322 | """Split large matrix by feature dimension.""" 323 | from tqdm import trange 324 | n = attr_matrix.shape[0] 
325 | nd = n // spt 326 | for i in trange(spt): 327 | if i < spt - 1: 328 | idxl, idxr = i * nd, (i+1) * nd 329 | else: 330 | idxl, idxr = i * nd, n 331 | prt = attr_matrix[idxl:idxr, :] 332 | 333 | prt_path = self._get_path('{}_{}.npy'.format(name, i)) 334 | np.save(prt_path, prt) 335 | 336 | 337 | if __name__ == '__main__': 338 | processor = DataProcess('pubmed', seed=0) 339 | processor.input(['adjtxt', 'attr_matrix', 'labels']) 340 | processor.calculate(['deg', 'idx_train', 'attr_matrix_norm']) 341 | processor.output(['deg', 'query', 'attr_matrix_norm']) 342 | print(processor) 343 | -------------------------------------------------------------------------------- /Precompute/npy.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Fork from: https://github.com/llohse/libnpy/ 3 | Copyright 2017 Leon Merten Lohse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | */ 23 | 24 | #ifndef NPY_HPP_ 25 | #define NPY_HPP_ 26 | 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | 45 | 46 | namespace npy { 47 | 48 | /* Compile-time test for byte order. 49 | If your compiler does not define these per default, you may want to define 50 | one of these constants manually. 51 | Defaults to little endian order. */ 52 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN || \ 53 | defined(__BIG_ENDIAN__) || \ 54 | defined(__ARMEB__) || \ 55 | defined(__THUMBEB__) || \ 56 | defined(__AARCH64EB__) || \ 57 | defined(_MIBSEB) || defined(__MIBSEB) || defined(__MIBSEB__) 58 | const bool big_endian = true; 59 | #else 60 | const bool big_endian = false; 61 | #endif 62 | 63 | 64 | const char magic_string[] = "\x93NUMPY"; 65 | const size_t magic_string_length = 6; 66 | 67 | const char little_endian_char = '<'; 68 | const char big_endian_char = '>'; 69 | const char no_endian_char = '|'; 70 | 71 | constexpr std::array 72 | endian_chars = {little_endian_char, big_endian_char, no_endian_char}; 73 | constexpr std::array 74 | numtype_chars = {'f', 'i', 'u', 'c'}; 75 | 76 | constexpr char host_endian_char = (big_endian ? 
77 | big_endian_char : 78 | little_endian_char); 79 | 80 | /* npy array length */ 81 | typedef unsigned long int ndarray_len_t; 82 | 83 | typedef std::pair version_t; 84 | 85 | struct dtype_t { 86 | const char byteorder; 87 | const char kind; 88 | const unsigned int itemsize; 89 | 90 | // TODO(llohse): implement as constexpr 91 | inline std::string str() const { 92 | const size_t max_buflen = 16; 93 | char buf[max_buflen]; 94 | std::snprintf(buf, max_buflen, "%c%c%u", byteorder, kind, itemsize); 95 | return std::string(buf); 96 | } 97 | 98 | inline std::tuple tie() const { 99 | return std::tie(byteorder, kind, itemsize); 100 | } 101 | }; 102 | 103 | 104 | struct header_t { 105 | const dtype_t dtype; 106 | const bool fortran_order; 107 | const std::vector shape; 108 | }; 109 | 110 | inline void write_magic(std::ostream &ostream, version_t version) { 111 | ostream.write(magic_string, magic_string_length); 112 | ostream.put(version.first); 113 | ostream.put(version.second); 114 | } 115 | 116 | inline version_t read_magic(std::istream &istream) { 117 | char buf[magic_string_length + 2]; 118 | istream.read(buf, magic_string_length + 2); 119 | 120 | if (!istream) { 121 | throw std::runtime_error("io error: failed reading file"); 122 | } 123 | 124 | if (0 != std::memcmp(buf, magic_string, magic_string_length)) 125 | throw std::runtime_error("this file does not have a valid npy format."); 126 | 127 | version_t version; 128 | version.first = buf[magic_string_length]; 129 | version.second = buf[magic_string_length + 1]; 130 | 131 | return version; 132 | } 133 | 134 | const std::unordered_map dtype_map = { 135 | {std::type_index(typeid(float)), {host_endian_char, 'f', sizeof(float)}}, 136 | {std::type_index(typeid(double)), {host_endian_char, 'f', sizeof(double)}}, 137 | {std::type_index(typeid(long double)), {host_endian_char, 'f', sizeof(long double)}}, 138 | {std::type_index(typeid(char)), {no_endian_char, 'i', sizeof(char)}}, 139 | {std::type_index(typeid(signed char)), 
{no_endian_char, 'i', sizeof(signed char)}}, 140 | {std::type_index(typeid(short)), {host_endian_char, 'i', sizeof(short)}}, 141 | {std::type_index(typeid(int)), {host_endian_char, 'i', sizeof(int)}}, 142 | {std::type_index(typeid(long)), {host_endian_char, 'i', sizeof(long)}}, 143 | {std::type_index(typeid(long long)), {host_endian_char, 'i', sizeof(long long)}}, 144 | {std::type_index(typeid(unsigned char)), {no_endian_char, 'u', sizeof(unsigned char)}}, 145 | {std::type_index(typeid(unsigned short)), {host_endian_char, 'u', sizeof(unsigned short)}}, 146 | {std::type_index(typeid(unsigned int)), {host_endian_char, 'u', sizeof(unsigned int)}}, 147 | {std::type_index(typeid(unsigned long)), {host_endian_char, 'u', sizeof(unsigned long)}}, 148 | {std::type_index(typeid(unsigned long long)), {host_endian_char, 'u', sizeof(unsigned long long)}}, 149 | {std::type_index(typeid(std::complex)), {host_endian_char, 'c', sizeof(std::complex)}}, 150 | {std::type_index(typeid(std::complex)), {host_endian_char, 'c', sizeof(std::complex)}}, 151 | {std::type_index(typeid(std::complex)), {host_endian_char, 'c', sizeof(std::complex)}} 152 | }; 153 | 154 | 155 | // helpers 156 | inline bool is_digits(const std::string &str) { 157 | return std::all_of(str.begin(), str.end(), ::isdigit); 158 | } 159 | 160 | template 161 | inline bool in_array(T val, const std::array &arr) { 162 | return std::find(std::begin(arr), std::end(arr), val) != std::end(arr); 163 | } 164 | 165 | inline dtype_t parse_descr(std::string typestring) { 166 | if (typestring.length() < 3) { 167 | throw std::runtime_error("invalid typestring (length)"); 168 | } 169 | 170 | char byteorder_c = typestring.at(0); 171 | char kind_c = typestring.at(1); 172 | std::string itemsize_s = typestring.substr(2); 173 | 174 | if (!in_array(byteorder_c, endian_chars)) { 175 | throw std::runtime_error("invalid typestring (byteorder)"); 176 | } 177 | 178 | if (!in_array(kind_c, numtype_chars)) { 179 | throw std::runtime_error("invalid 
typestring (kind)"); 180 | } 181 | 182 | if (!is_digits(itemsize_s)) { 183 | throw std::runtime_error("invalid typestring (itemsize)"); 184 | } 185 | unsigned int itemsize = std::stoul(itemsize_s); 186 | 187 | return {byteorder_c, kind_c, itemsize}; 188 | } 189 | 190 | namespace pyparse { 191 | 192 | /** 193 | Removes leading and trailing whitespaces 194 | */ 195 | inline std::string trim(const std::string &str) { 196 | const std::string whitespace = " \t"; 197 | auto begin = str.find_first_not_of(whitespace); 198 | 199 | if (begin == std::string::npos) 200 | return ""; 201 | 202 | auto end = str.find_last_not_of(whitespace); 203 | 204 | return str.substr(begin, end - begin + 1); 205 | } 206 | 207 | 208 | inline std::string get_value_from_map(const std::string &mapstr) { 209 | size_t sep_pos = mapstr.find_first_of(":"); 210 | if (sep_pos == std::string::npos) 211 | return ""; 212 | 213 | std::string tmp = mapstr.substr(sep_pos + 1); 214 | return trim(tmp); 215 | } 216 | 217 | /** 218 | Parses the string representation of a Python dict 219 | 220 | The keys need to be known and may not appear anywhere else in the data. 
221 | */ 222 | inline std::unordered_map parse_dict(std::string in, const std::vector &keys) { 223 | std::unordered_map map; 224 | 225 | if (keys.size() == 0) 226 | return map; 227 | 228 | in = trim(in); 229 | 230 | // unwrap dictionary 231 | if ((in.front() == '{') && (in.back() == '}')) 232 | in = in.substr(1, in.length() - 2); 233 | else 234 | throw std::runtime_error("Not a Python dictionary."); 235 | 236 | std::vector > positions; 237 | 238 | for (auto const &value : keys) { 239 | size_t pos = in.find("'" + value + "'"); 240 | 241 | if (pos == std::string::npos) 242 | throw std::runtime_error("Missing '" + value + "' key."); 243 | 244 | std::pair position_pair{pos, value}; 245 | positions.push_back(position_pair); 246 | } 247 | 248 | // sort by position in dict 249 | std::sort(positions.begin(), positions.end()); 250 | 251 | for (size_t i = 0; i < positions.size(); ++i) { 252 | std::string raw_value; 253 | size_t begin{positions[i].first}; 254 | size_t end{std::string::npos}; 255 | 256 | std::string key = positions[i].second; 257 | 258 | if (i + 1 < positions.size()) 259 | end = positions[i + 1].first; 260 | 261 | raw_value = in.substr(begin, end - begin); 262 | 263 | raw_value = trim(raw_value); 264 | 265 | if (raw_value.back() == ',') 266 | raw_value.pop_back(); 267 | 268 | map[key] = get_value_from_map(raw_value); 269 | } 270 | 271 | return map; 272 | } 273 | 274 | /** 275 | Parses the string representation of a Python boolean 276 | */ 277 | inline bool parse_bool(const std::string &in) { 278 | if (in == "True") 279 | return true; 280 | if (in == "False") 281 | return false; 282 | 283 | throw std::runtime_error("Invalid python boolean."); 284 | } 285 | 286 | /** 287 | Parses the string representation of a Python str 288 | */ 289 | inline std::string parse_str(const std::string &in) { 290 | if ((in.front() == '\'') && (in.back() == '\'')) 291 | return in.substr(1, in.length() - 2); 292 | 293 | throw std::runtime_error("Invalid python string."); 294 | } 295 | 
296 | /** 297 | Parses the string representation of a Python tuple into a vector of its items 298 | */ 299 | inline std::vector parse_tuple(std::string in) { 300 | std::vector v; 301 | const char seperator = ','; 302 | 303 | in = trim(in); 304 | 305 | if ((in.front() == '(') && (in.back() == ')')) 306 | in = in.substr(1, in.length() - 2); 307 | else 308 | throw std::runtime_error("Invalid Python tuple."); 309 | 310 | std::istringstream iss(in); 311 | 312 | for (std::string token; std::getline(iss, token, seperator);) { 313 | v.push_back(token); 314 | } 315 | 316 | return v; 317 | } 318 | 319 | template 320 | inline std::string write_tuple(const std::vector &v) { 321 | if (v.size() == 0) 322 | return "()"; 323 | 324 | std::ostringstream ss; 325 | 326 | if (v.size() == 1) { 327 | ss << "(" << v.front() << ",)"; 328 | } else { 329 | const std::string delimiter = ", "; 330 | // v.size() > 1 331 | ss << "("; 332 | std::copy(v.begin(), v.end() - 1, std::ostream_iterator(ss, delimiter.c_str())); 333 | ss << v.back(); 334 | ss << ")"; 335 | } 336 | 337 | return ss.str(); 338 | } 339 | 340 | inline std::string write_boolean(bool b) { 341 | if (b) 342 | return "True"; 343 | else 344 | return "False"; 345 | } 346 | 347 | } // namespace pyparse 348 | 349 | 350 | inline header_t parse_header(std::string header) { 351 | /* 352 | The first 6 bytes are a magic string: exactly "x93NUMPY". 353 | The next 1 byte is an unsigned byte: the major version number of the file format, e.g. x01. 354 | The next 1 byte is an unsigned byte: the minor version number of the file format, e.g. x00. Note: the version of the file format is not tied to the version of the numpy package. 355 | The next 2 bytes form a little-endian unsigned short int: the length of the header data HEADER_LEN. 356 | The next HEADER_LEN bytes form the header data describing the array's format. It is an ASCII string which contains a Python literal expression of a dictionary. 
It is terminated by a newline ('n') and padded with spaces ('x20') to make the total length of the magic string + 4 + HEADER_LEN be evenly divisible by 16 for alignment purposes. 357 | The dictionary contains three keys: 358 | 359 | "descr" : dtype.descr 360 | An object that can be passed as an argument to the numpy.dtype() constructor to create the array's dtype. 361 | "fortran_order" : bool 362 | Whether the array data is Fortran-contiguous or not. Since Fortran-contiguous arrays are a common form of non-C-contiguity, we allow them to be written directly to disk for efficiency. 363 | "shape" : tuple of int 364 | The shape of the array. 365 | For repeatability and readability, this dictionary is formatted using pprint.pformat() so the keys are in alphabetic order. 366 | */ 367 | 368 | // remove trailing newline 369 | if (header.back() != '\n') 370 | throw std::runtime_error("invalid header"); 371 | header.pop_back(); 372 | 373 | // parse the dictionary 374 | std::vector keys{"descr", "fortran_order", "shape"}; 375 | auto dict_map = npy::pyparse::parse_dict(header, keys); 376 | 377 | if (dict_map.size() == 0) 378 | throw std::runtime_error("invalid dictionary in header"); 379 | 380 | std::string descr_s = dict_map["descr"]; 381 | std::string fortran_s = dict_map["fortran_order"]; 382 | std::string shape_s = dict_map["shape"]; 383 | 384 | std::string descr = npy::pyparse::parse_str(descr_s); 385 | dtype_t dtype = parse_descr(descr); 386 | 387 | // convert literal Python bool to C++ bool 388 | bool fortran_order = npy::pyparse::parse_bool(fortran_s); 389 | 390 | // parse the shape tuple 391 | auto shape_v = npy::pyparse::parse_tuple(shape_s); 392 | 393 | std::vector shape; 394 | for (auto item : shape_v) { 395 | ndarray_len_t dim = static_cast(std::stoul(item)); 396 | shape.push_back(dim); 397 | } 398 | 399 | return {dtype, fortran_order, shape}; 400 | } 401 | 402 | 403 | inline std::string 404 | write_header_dict(const std::string &descr, bool fortran_order, const 
std::vector &shape) { 405 | std::string s_fortran_order = npy::pyparse::write_boolean(fortran_order); 406 | std::string shape_s = npy::pyparse::write_tuple(shape); 407 | 408 | return "{'descr': '" + descr + "', 'fortran_order': " + s_fortran_order + ", 'shape': " + shape_s + ", }"; 409 | } 410 | 411 | inline void write_header(std::ostream &out, const header_t &header) { 412 | std::string header_dict = write_header_dict(header.dtype.str(), header.fortran_order, header.shape); 413 | 414 | size_t length = magic_string_length + 2 + 2 + header_dict.length() + 1; 415 | 416 | version_t version{1, 0}; 417 | if (length >= 255 * 255) { 418 | length = magic_string_length + 2 + 4 + header_dict.length() + 1; 419 | version = {2, 0}; 420 | } 421 | size_t padding_len = 16 - length % 16; 422 | std::string padding(padding_len, ' '); 423 | 424 | // write magic 425 | write_magic(out, version); 426 | 427 | // write header length 428 | if (version == version_t{1, 0}) { 429 | uint8_t header_len_le16[2]; 430 | uint16_t header_len = static_cast(header_dict.length() + padding.length() + 1); 431 | 432 | header_len_le16[0] = (header_len >> 0) & 0xff; 433 | header_len_le16[1] = (header_len >> 8) & 0xff; 434 | out.write(reinterpret_cast(header_len_le16), 2); 435 | } else { 436 | uint8_t header_len_le32[4]; 437 | uint32_t header_len = static_cast(header_dict.length() + padding.length() + 1); 438 | 439 | header_len_le32[0] = (header_len >> 0) & 0xff; 440 | header_len_le32[1] = (header_len >> 8) & 0xff; 441 | header_len_le32[2] = (header_len >> 16) & 0xff; 442 | header_len_le32[3] = (header_len >> 24) & 0xff; 443 | out.write(reinterpret_cast(header_len_le32), 4); 444 | } 445 | 446 | out << header_dict << padding << '\n'; 447 | } 448 | 449 | inline std::string read_header(std::istream &istream) { 450 | // check magic bytes an version number 451 | version_t version = read_magic(istream); 452 | 453 | uint32_t header_length; 454 | if (version == version_t{1, 0}) { 455 | uint8_t header_len_le16[2]; 456 
| istream.read(reinterpret_cast(header_len_le16), 2); 457 | header_length = (header_len_le16[0] << 0) | (header_len_le16[1] << 8); 458 | 459 | if ((magic_string_length + 2 + 2 + header_length) % 16 != 0) { 460 | // TODO(llohse): display warning 461 | } 462 | } else if (version == version_t{2, 0}) { 463 | uint8_t header_len_le32[4]; 464 | istream.read(reinterpret_cast(header_len_le32), 4); 465 | 466 | header_length = (header_len_le32[0] << 0) | (header_len_le32[1] << 8) 467 | | (header_len_le32[2] << 16) | (header_len_le32[3] << 24); 468 | 469 | if ((magic_string_length + 2 + 4 + header_length) % 16 != 0) { 470 | // TODO(llohse): display warning 471 | } 472 | } else { 473 | throw std::runtime_error("unsupported file format version"); 474 | } 475 | 476 | auto buf_v = std::vector(); 477 | buf_v.reserve(header_length); 478 | istream.read(buf_v.data(), header_length); 479 | std::string header(buf_v.data(), header_length); 480 | 481 | return header; 482 | } 483 | 484 | inline ndarray_len_t comp_size(const std::vector &shape) { 485 | ndarray_len_t size = 1; 486 | for (ndarray_len_t i : shape) 487 | size *= i; 488 | 489 | return size; 490 | } 491 | 492 | template 493 | inline void 494 | SaveArrayAsNumpy(const std::string &filename, bool fortran_order, unsigned int n_dims, const unsigned long shape[], 495 | const Scalar* data) { 496 | // static_assert(has_typestring::value, "scalar type not understood"); 497 | const dtype_t dtype = dtype_map.at(std::type_index(typeid(Scalar))); 498 | 499 | std::ofstream stream(filename, std::ofstream::binary); 500 | if (!stream) { 501 | throw std::runtime_error("io error: failed to open a file."); 502 | } 503 | 504 | std::vector shape_v(shape, shape + n_dims); 505 | header_t header{dtype, fortran_order, shape_v}; 506 | write_header(stream, header); 507 | 508 | auto size = static_cast(comp_size(shape_v)); 509 | 510 | stream.write(reinterpret_cast(data), sizeof(Scalar) * size); 511 | } 512 | 513 | template 514 | inline void 515 | 
SaveArrayAsNumpy(const std::string &filename, bool fortran_order, unsigned int n_dims, const unsigned long shape[], 516 | const std::vector &data) { 517 | SaveArrayAsNumpy(filename, fortran_order, n_dims, shape, data.data()); 518 | } 519 | 520 | template 521 | inline void 522 | LoadArrayFromNumpy(const std::string &filename, std::vector &shape, std::vector &data) { 523 | bool fortran_order; 524 | LoadArrayFromNumpy(filename, shape, fortran_order, data); 525 | } 526 | 527 | template 528 | inline void LoadArrayFromNumpy(const std::string &filename, std::vector &shape, bool &fortran_order, 529 | std::vector &data) { 530 | std::ifstream stream(filename, std::ifstream::binary); 531 | if (!stream) { 532 | throw std::runtime_error("io error: failed to open a file."); 533 | } 534 | 535 | std::string header_s = read_header(stream); 536 | 537 | // parse header 538 | header_t header = parse_header(header_s); 539 | 540 | // check if the typestring matches the given one 541 | // static_assert(has_typestring::value, "scalar type not understood"); 542 | const dtype_t dtype = dtype_map.at(std::type_index(typeid(Scalar))); 543 | 544 | if (header.dtype.tie() != dtype.tie()) { 545 | throw std::runtime_error("formatting error: typestrings not matching"); 546 | } 547 | 548 | shape = header.shape; 549 | fortran_order = header.fortran_order; 550 | 551 | // compute the data size based on the shape 552 | auto size = static_cast(comp_size(shape)); 553 | data.resize(size); 554 | 555 | // read the data 556 | stream.read(reinterpret_cast(data.data()), sizeof(Scalar) * size); 557 | } 558 | 559 | } // namespace npy 560 | 561 | #endif // NPY_HPP_ -------------------------------------------------------------------------------- /Precompute/FeatureOp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Implementation of embedding computation from feature 3 | Author: nyLiao 4 | */ 5 | #include 6 | #include 7 | #include "Graph.h" 8 | #include "SpeedPPR.h" 9 
| #include "HelperFunctions.h" 10 | #include "FeatureOp.h" 11 | #ifdef ENABLE_RW 12 | #include "BatchRandomWalk.h" 13 | #endif 14 | 15 | 16 | // Wrapper processor class: feat-push 17 | class FeatProc { 18 | protected: 19 | Param ¶m; 20 | Graph &graph; 21 | class SpeedPPR ppr; 22 | IntVector Vt_nodes; // list of queried nodes 23 | My2DVector feat_v2d; // feature matrix data 24 | MyMatrix feat_matrix; // feature matrix mapped as vec of vec 25 | #ifdef ENABLE_RW 26 | WalkCache walkCache; 27 | #endif 28 | 29 | NInt thread_num; // number of threads 30 | ScoreFlt epsilon; 31 | ScoreFlt alpha; 32 | ScoreFlt lower_threshold; 33 | 34 | public: 35 | NInt V_num; // number of vertices 36 | NInt Vt_num; // number of queried nodes 37 | NInt feat_size; // size of feature 38 | NInt thd_size; // size of feature per thread 39 | // statistics 40 | double time_total = 0; 41 | std::vector time_read, time_write, time_push; 42 | std::vector time_init, time_fp, time_it, time_rw; 43 | 44 | protected: 45 | 46 | void push_one(const NInt i, const NInt tid, SpeedPPR::GStruct &_gstruct) { 47 | // printf("ID: %4" IDFMT "\n", i); 48 | double time_start = getCurrentTime(); 49 | #ifdef ENABLE_RW 50 | if (param.index) 51 | ppr.calc_ppr_cache(_gstruct, feat_matrix[i], Vt_nodes, epsilon, alpha, lower_threshold, walkCache); 52 | else 53 | ppr.calc_ppr_walk(_gstruct, feat_matrix[i], Vt_nodes, epsilon, alpha, lower_threshold); 54 | #else 55 | ppr.calc_ppr_walk(_gstruct, feat_matrix[i], Vt_nodes, epsilon, alpha, lower_threshold); 56 | #endif 57 | time_push[tid] += getCurrentTime() - time_start; 58 | 59 | // Save embedding vector of feature i on all nodes to feat_matrix in place 60 | time_start = getCurrentTime(); 61 | feat_matrix[i].swap(_gstruct.means); // feat_matrix[i] = _gstruct.means; 62 | time_write[tid] += getCurrentTime() - time_start; 63 | } 64 | 65 | void push_thread(const NInt feat_left, const NInt feat_right, const NInt tid) { 66 | // cout<<" Pushing: "< gstruct(V_num); 68 | for (NInt i = 
feat_left; i < feat_right; i++) { 69 | push_one(i, tid, gstruct); 70 | } 71 | time_init[tid] = gstruct.time_init; 72 | time_fp[tid] = gstruct.time_fp; 73 | time_it[tid] = gstruct.time_it; 74 | time_rw[tid] = gstruct.time_rw; 75 | } 76 | 77 | public: 78 | 79 | FeatProc(Graph &_graph, Param &_param) : 80 | V_num(_graph.getNumOfVertices()), 81 | ppr(_graph), 82 | #ifdef ENABLE_RW 83 | walkCache(_graph), 84 | #endif 85 | feat_matrix(2), 86 | epsilon(_param.epsilon), 87 | alpha(_param.alpha), 88 | lower_threshold(1.0 / _graph.getNumOfVertices()), 89 | graph(_graph), 90 | param(_param) { 91 | printf("Adj RSS RAM: %.3f GB\n", get_stat_memory()); 92 | Vt_num = load_query(Vt_nodes, param.query_file, V_num); 93 | feat_v2d.load_npy(param.feature_file); 94 | feat_size = feat_v2d.nrows(); 95 | feat_matrix.set_size(feat_size, V_num); 96 | feat_matrix.from_V2D(feat_v2d, Vt_nodes); 97 | printf("Max RSS PRAM: %.3f GB\n", get_proc_memory()); 98 | #ifdef ENABLE_RW 99 | // Perform cached random walk 100 | if (param.index) { 101 | graph.set_dummy_neighbor(graph.get_dummy_id()); 102 | // walkCache.generate(); 103 | graph.reset_set_dummy_neighbor(); 104 | } 105 | #endif 106 | graph.fill_dead_end_neighbor_with_id(); 107 | 108 | thread_num = (NInt) param.thread_num; 109 | thd_size = (feat_size + thread_num - 1) / thread_num; 110 | time_read.resize(thread_num, 0); 111 | time_write.resize(thread_num, 0); 112 | time_push.resize(thread_num, 0); 113 | time_init.resize(thread_num, 0); 114 | time_fp.resize(thread_num, 0); 115 | time_it.resize(thread_num, 0); 116 | time_rw.resize(thread_num, 0); 117 | } 118 | 119 | void save_output(const NInt feat_left, const NInt feat_right) { 120 | if (param.output_estimations) { 121 | std::stringstream res_file; 122 | res_file << param.estimation_folder << "/score_" << param.alpha << '_' << param.epsilon << ".npy"; 123 | feat_matrix.to_V2D(feat_v2d); 124 | feat_v2d.save_npy(res_file.str()); 125 | } 126 | } 127 | 128 | void show_statistics() { 129 | 
printf("%s\n", std::string(80, '-').c_str()); 130 | printf("Max RURSS PRAM: %.3f GB\n", get_proc_memory()); 131 | printf("End RSS RAM: %.3f GB\n", get_stat_memory()); 132 | printf("Total Time : %.6f, Average: %.8f / node-thread\n", time_total, time_total * thread_num / feat_size); 133 | printf("Push Time Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_push), vector_L1(time_push) / thread_num); 134 | printf(" Init Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_init), vector_L1(time_init) / thread_num); 135 | printf(" FwdPush Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_fp), vector_L1(time_fp) / thread_num); 136 | printf(" PwrIter Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_it), vector_L1(time_it) / thread_num); 137 | printf(" RW Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_rw), vector_L1(time_rw) / thread_num); 138 | // printf("Read Time Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_read), vector_L1(time_read) / thread_num); 139 | printf("Write Time Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_write), vector_L1(time_write) / thread_num); 140 | } 141 | 142 | void push() { 143 | std::vector threads; 144 | 145 | double time_start = getCurrentTime(); 146 | for (NInt thd_left = 0; thd_left < feat_size; thd_left += thd_size) { 147 | NInt thd_right = std::min(feat_size, thd_left + thd_size); 148 | NInt tid = thd_left / thd_size; 149 | threads.emplace_back(std::thread(&FeatProc::push_thread, this, thd_left, thd_right, tid)); 150 | } 151 | for (auto &t : threads) { 152 | t.join(); 153 | } 154 | time_total += getCurrentTime() - time_start; 155 | 156 | save_output(0, feat_size); 157 | } 158 | 159 | // No threading, debug use 160 | void push_single() { 161 | double time_start = getCurrentTime(); 162 | SpeedPPR::GStruct gstruct(V_num); 163 | FltVector seed; 164 | for (NInt i = 0; i < feat_size; i++) { 165 | push_one(i, 0, gstruct); 166 | } 167 | time_total += getCurrentTime() - time_start; 168 | save_output(0, feat_size); 169 | 
} 170 | 171 | }; 172 | 173 | 174 | // Feat-reuse template 175 | class FeatProc_reuse : public FeatProc { 176 | private: 177 | virtual ScoreFlt reduce_feat(const NInt i, FltVector &base_weight) = 0; 178 | 179 | public: 180 | NInt base_size; 181 | // statistics 182 | std::vector time_reuse; 183 | ScoreFlt avg_tht = 0; // average base coefficient 184 | ScoreFlt avg_res = 0; // average reuse residue 185 | NInt re_feat_num = 0; // number of reused features 186 | 187 | protected: 188 | ScoreFlt gamma; 189 | IntVector base_idx; // index of base features 190 | MyMatrix base_matrix; // matrix of base features 191 | MyMatrix base_result; // output result (on all features and nodes) 192 | ScoreFlt TOL; // tolerance for reuse coefficient 193 | 194 | protected: 195 | 196 | void push_one_base(const NInt idx, const NInt tid, SpeedPPR::GStruct &_gstruct) { 197 | // printf("ID: %4" IDFMT " as base\n", idx); 198 | double time_start = getCurrentTime(); 199 | #ifdef ENABLE_RW 200 | if (param.index) 201 | ppr.calc_ppr_cache(_gstruct, feat_matrix[base_idx[idx]], Vt_nodes, epsilon, alpha, lower_threshold, walkCache, gamma); 202 | else 203 | ppr.calc_ppr_walk(_gstruct, feat_matrix[base_idx[idx]], Vt_nodes, epsilon, alpha, lower_threshold, gamma); 204 | #else 205 | ppr.calc_ppr_walk(_gstruct, feat_matrix[base_idx[idx]], Vt_nodes, epsilon, alpha, lower_threshold, gamma); 206 | #endif 207 | time_push[tid] += getCurrentTime() - time_start; 208 | 209 | time_start = getCurrentTime(); 210 | base_result[idx].swap(_gstruct.means); // base_result[idx] = _gstruct.means; 211 | time_write[tid] += getCurrentTime() - time_start; 212 | } 213 | 214 | void push_thread_base(const NInt feat_left, const NInt feat_right, const NInt tid) { 215 | // cout<<" Pushing: "< gstruct(V_num); 217 | for (NInt i = feat_left; i < feat_right; i++) { 218 | push_one_base(i, tid, gstruct); 219 | } 220 | time_init[tid] += gstruct.time_init; 221 | time_fp[tid] += gstruct.time_fp; 222 | time_it[tid] += gstruct.time_it; 223 | 
time_rw[tid] += gstruct.time_rw; 224 | } 225 | 226 | void push_one_rest(const NInt i, const NInt tid, SpeedPPR::GStruct &_gstruct) { 227 | for (NInt idx = 0; idx < base_size; idx++) { 228 | if (base_idx[idx] == i) { 229 | // printf("ID: %4" IDFMT " is base\n", i); 230 | double time_start = getCurrentTime(); 231 | feat_matrix.copy_row(i, base_result[idx]); 232 | time_write[tid] += getCurrentTime() - time_start; 233 | return; 234 | } 235 | } 236 | 237 | // ===== Base reduction 238 | double time_start = getCurrentTime(); 239 | FltVector base_weight; 240 | ScoreFlt theta_sum = reduce_feat(i, base_weight); 241 | time_reuse[tid] += getCurrentTime() - time_start; 242 | // printf("ID: %4" IDFMT ", theta_sum: %.6f, residue_sum: %.6f\n", i, theta_sum, vector_L1(_raw_seed)); 243 | // Ignore less relevant features 244 | // if (theta_sum < 1.6) return; 245 | avg_tht += theta_sum; 246 | re_feat_num++; 247 | #ifdef DEBUG 248 | ScoreFlt res_i = vector_L1(feat_matrix[i]); 249 | avg_res += res_i; 250 | // if (i < 10) 251 | // cout<<"Re "< gstruct(V_num); 290 | for (NInt i = feat_left; i < feat_right; i++) { 291 | push_one_rest(i, tid, gstruct); 292 | } 293 | time_init[tid] += gstruct.time_init; 294 | time_fp[tid] += gstruct.time_fp; 295 | time_it[tid] += gstruct.time_it; 296 | time_rw[tid] += gstruct.time_rw; 297 | } 298 | 299 | public: 300 | 301 | FeatProc_reuse(Graph &_graph, Param &_param) : 302 | FeatProc(_graph, _param), 303 | gamma(_param.gamma), 304 | base_size(std::max(NInt (3u), NInt (feat_size * param.base_ratio))), 305 | base_matrix(base_size, V_num, 2), 306 | base_result(base_size, V_num, 2) { 307 | time_reuse.resize(thread_num, 0); 308 | } 309 | 310 | void show_statistics() { 311 | avg_tht /= re_feat_num; 312 | MSG(avg_tht); 313 | #ifdef DEBUG 314 | avg_res /= re_feat_num; 315 | MSG(avg_res); 316 | #endif 317 | MSG(re_feat_num); 318 | FeatProc::show_statistics(); 319 | printf("Reuse Time Sum: %.6f, Average: %.8f / thread\n", vector_L1(time_reuse), vector_L1(time_reuse) 
/ thread_num); 320 | } 321 | 322 | void push() { 323 | // Calculate base PPR 324 | NInt thread_num_base = std::min(base_size, thread_num); 325 | NInt thd_size_base = (base_size + thread_num_base - 1) / thread_num_base; 326 | std::vector threads_base; 327 | 328 | double time_start = getCurrentTime(); 329 | for (NInt thd_left = 0; thd_left < base_size; thd_left += thd_size_base) { 330 | NInt thd_right = std::min(base_size, thd_left + thd_size_base); 331 | NInt tid = thd_left / thd_size_base; 332 | threads_base.emplace_back(std::thread(&FeatProc_reuse::push_thread_base, this, thd_left, thd_right, tid)); 333 | } 334 | for (auto &t : threads_base) { 335 | t.join(); 336 | } 337 | time_total += getCurrentTime() - time_start; 338 | printf("Time Used on Base %.6f\n", time_total); 339 | 340 | // Calculate rest PPR 341 | std::vector threads; 342 | 343 | time_start = getCurrentTime(); 344 | for (NInt thd_left = 0; thd_left < feat_size; thd_left += thd_size) { 345 | NInt thd_right = std::min(feat_size, thd_left + thd_size); 346 | NInt tid = thd_left / thd_size; 347 | threads.emplace_back(std::thread(&FeatProc_reuse::push_thread_rest, this, thd_left, thd_right, tid)); 348 | } 349 | for (auto &t : threads) { 350 | t.join(); 351 | } 352 | time_total += getCurrentTime() - time_start; 353 | 354 | save_output(0, feat_size); 355 | } 356 | 357 | // No multithreading, debug use 358 | void push_single() { 359 | SpeedPPR::GStruct gstruct(V_num); 360 | // Calculate base PPR 361 | double time_start = getCurrentTime(); 362 | for(NInt i = 0; i < base_size; i++){ 363 | push_one_base(i, 0, gstruct); 364 | } 365 | time_total += getCurrentTime() - time_start; 366 | printf("Time Used on Base %.6f\n", time_total); 367 | // Calculate rest PPR 368 | time_start = getCurrentTime(); 369 | for (NInt i = 0; i < feat_size; i++) { 370 | push_one_rest(i, 0, gstruct); 371 | } 372 | time_total += getCurrentTime() - time_start; 373 | save_output(0, feat_size); 374 | } 375 | 376 | }; 377 | 378 | // Feat-reuse: 
greedy 379 | class FeatProc_greedy : public FeatProc_reuse { 380 | private: 381 | MyMatrix base_inv; 382 | 383 | /* Compute coefficients (theta) of bases, feat_matrix[i] is updated to residue */ 384 | ScoreFlt reduce_feat(const NInt i, FltVector &base_weight) { 385 | base_weight.resize(base_size); 386 | base_weight = reuse_weight(feat_matrix[i], base_matrix); 387 | 388 | /* Least square regression on x = B * w */ 389 | // std::fill(base_weight.begin(), base_weight.end(), 0.0); 390 | // for (NInt idx = 0; idx < base_size; idx++) { 391 | // for (NInt j = 0; j < V_num; j++) { 392 | // base_weight[idx] += feat_matrix[i][j] * base_inv[idx][j]; 393 | // } 394 | // if (fabs(base_weight[idx]) > TOL) { 395 | // for (NInt j = 0; j < V_num; j++) 396 | // feat_matrix[i][j] -= base_matrix[idx][j] * base_weight[idx]; 397 | // } 398 | // } 399 | return vector_L1(base_weight); 400 | } 401 | 402 | public: 403 | 404 | FeatProc_greedy (Graph &_graph, Param &_param) : 405 | FeatProc_reuse(_graph, _param), 406 | base_inv(base_size, V_num) { 407 | TOL = 1e-2; 408 | } 409 | 410 | void fit() { 411 | base_idx = select_base(feat_matrix, base_size); 412 | base_matrix.copy_rows(base_idx, feat_matrix); 413 | // ScoreMatrix base_Matrix_ = base_matrix.to_Eigen(); 414 | // RandomizedSvd rsvd(base_Matrix_, base_size); 415 | // ScoreMatrix base_Inv_ = rsvd.pinv(); 416 | // base_Inv_.transposeInPlace(); 417 | // base_inv.from_Eigen(base_Inv_); 418 | cout<<"Base size: "<= 0; base_tol_idd--) { 437 | idx = base_idx[base_tol_idd]; 438 | if (fabs(base_weight[idx]) < TOL) { 439 | base_weight[idx] = 0.0f; 440 | } else { 441 | break; 442 | } 443 | } 444 | 445 | // TODO: parallelize 446 | FltVector &feat = feat_matrix[i]; 447 | if (base_tol_idd > -1) { 448 | // Reduce feature by bases 449 | for (NInt idd = 0; idd < base_tol_idd; idd++) { 450 | idx = base_idx[idd]; 451 | for (NInt j = 0; j < V_num; j++) 452 | feat[j] -= base_matrix[idx][j] * base_weight[idx]; 453 | } 454 | idx = base_idx[base_tol_idd]; 455 | 
const ScoreFlt feat_th = base_size / (V_num * sqrt(V_num)); 456 | for (NInt j = 0; j < V_num; j++) { 457 | feat[j] -= base_matrix[idx][j] * base_weight[idx]; 458 | // Shrink small values 459 | feat[j] = (fabs(feat[j]) < feat_th) ? 0.0f : feat[j]; 460 | } 461 | } 462 | return vector_L1(base_weight); 463 | } 464 | 465 | public: 466 | 467 | FeatProc_pca (Graph &_graph, Param &_param) : 468 | FeatProc_reuse(_graph, _param), 469 | theta_matrix(feat_size, base_size) { 470 | TOL = 1e-2; 471 | } 472 | 473 | void fit() { 474 | ScoreFlt avg_degree = graph.getNumOfEdges() / (ScoreFlt) graph.getNumOfVertices() / 2; 475 | // NInt Vs_num = std::max(3*base_size, NInt (V_num*param.base_ratio)); 476 | NInt Vs_num = ceil(0.1 * V_num); 477 | // NInt Vs_num = V_num; 478 | IntVector Vs_nodes = sample_nodes(Vt_nodes, Vs_num); 479 | ScoreMatrix feat_sample_Matrix = feat_matrix.to_Eigen(Vs_nodes); 480 | base_idx = select_pc(feat_sample_Matrix, theta_matrix, base_size, sqrt(avg_degree)); 481 | base_matrix.copy_rows(base_idx, feat_matrix); 482 | cout<<"Theta size: "< 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include "BasicDefinition.h" 18 | #include "HelperFunctions.h" 19 | 20 | 21 | template 22 | inline void show_vector(const std::string &_header, const std::vector &_vec) { 23 | if (_vec.empty()) { 24 | cout << "Empty Vector." 
<< endl; 25 | } else { 26 | cout << endl << _header; 27 | bool identical = true; 28 | const T &elem = _vec.front(); 29 | std::for_each(_vec.begin(), _vec.end(), [&](const T &e) { identical &= (e == elem); }); 30 | if (identical) { 31 | cout << "\tSize of the Vector: " << _vec.size() << "\t Value of Each Element: " << elem; 32 | } else { 33 | cout << endl; 34 | std::copy(begin(_vec), end(_vec), std::ostream_iterator(cout, "\t")); 35 | } 36 | cout << endl; 37 | } 38 | } 39 | 40 | class Graph { 41 | 42 | private: 43 | NInt numOfVertices = 0; 44 | NInt numOfEdges = 0; 45 | NInt num_deadend_vertices = 0; 46 | NInt sid = 0; 47 | NInt dummy_id = 0; 48 | ScoreFlt alpha = 0.2; 49 | NInt max_size_edge_list = 0; 50 | IntVector out_degrees; 51 | IntVector in_degrees; 52 | IntVector start_pos_in_out_neighbor_lists; 53 | IntVector start_pos_in_appearance_pos_lists; 54 | IntVector out_neighbors_lists; 55 | IntVector appearance_pos_lists; 56 | IntVector deadend_vertices; 57 | 58 | public: 59 | 60 | inline size_t get_num_dead_end() const { 61 | return deadend_vertices.size(); 62 | } 63 | 64 | inline void set_dummy_out_degree_zero() { 65 | out_degrees[dummy_id] = 0; 66 | start_pos_in_out_neighbor_lists[dummy_id + 1] = start_pos_in_out_neighbor_lists[dummy_id]; 67 | } 68 | 69 | inline void set_dummy_neighbor(const NInt &_id) { 70 | out_degrees[dummy_id] = 1; 71 | start_pos_in_out_neighbor_lists[dummy_id + 1] = start_pos_in_out_neighbor_lists[dummy_id] + 1; 72 | out_neighbors_lists[start_pos_in_out_neighbor_lists[dummy_id]] = _id; 73 | } 74 | 75 | inline void reset_set_dummy_neighbor() { 76 | out_degrees[dummy_id] = 0; 77 | out_neighbors_lists[start_pos_in_out_neighbor_lists[dummy_id]] = dummy_id; 78 | set_dummy_out_degree_zero(); 79 | } 80 | 81 | inline const NInt &get_dummy_id() const { 82 | return dummy_id; 83 | } 84 | 85 | inline const NInt &get_sid() const { 86 | return sid; 87 | } 88 | 89 | inline const ScoreFlt &get_alpha() const { 90 | return alpha; 91 | } 92 | 93 | inline 
void set_alpha(const ScoreFlt _alpha = 0.2) { 94 | alpha = _alpha; 95 | } 96 | 97 | inline void fill_dead_end_neighbor_with_id(const NInt &_id) { 98 | for (NInt index = 0; index < num_deadend_vertices; ++index) { 99 | const NInt &id = deadend_vertices[index]; 100 | const NInt &start = start_pos_in_out_neighbor_lists[id]; 101 | out_neighbors_lists[start] = _id; 102 | } 103 | } 104 | 105 | inline void fill_dead_end_neighbor_with_id() { 106 | //cout<< "num_deadend_vertices: " << num_deadend_vertices <= start_pos_in_out_neighbor_lists[dummy_id + 1]) { 163 | // MSG("Time to check " __FILE__) 164 | // MSG(__LINE__) 165 | // } 166 | assert(_index < start_pos_in_out_neighbor_lists[dummy_id + 1]); 167 | return out_neighbors_lists[_index]; 168 | } 169 | 170 | inline const NInt &getNumOfEdges() const { 171 | return numOfEdges; 172 | } 173 | 174 | 175 | void read_binary(const std::string &_attribute_file, 176 | const std::string &_graph_file) { 177 | { 178 | std::string line; 179 | std::ifstream attribute_file(_attribute_file); 180 | if (attribute_file.is_open()) { 181 | std::getline(attribute_file, line); 182 | size_t start1 = line.find_first_of('='); 183 | numOfVertices = std::stoul(line.substr(start1 + 1)); 184 | std::getline(attribute_file, line); 185 | size_t start2 = line.find_first_of('='); 186 | numOfEdges = std::stoul(line.substr(start2 + 1)); 187 | dummy_id = numOfVertices; 188 | // printf("The Number of Vertices: %" IDFMT "\n", numOfVertices); 189 | // printf("The Number of Edges: %" IDFMT "\n", numOfEdges); 190 | attribute_file.close(); 191 | } else { 192 | printf(__FILE__ "; LINE %d; File Not Exists.\n", __LINE__); 193 | cout << _attribute_file << endl; 194 | exit(1); 195 | } 196 | } 197 | // const auto start = getCurrentTime(); 198 | // create temporary graph 199 | std::vector edges(numOfEdges); 200 | if (std::FILE *f = std::fopen(_graph_file.c_str(), "rb")) { 201 | size_t rtn = std::fread(edges.data(), sizeof edges[0], edges.size(), f); 202 | printf("Edge from 
fread: %zu\n", rtn); 203 | std::fclose(f); 204 | } else { 205 | printf("Graph::read_binary; File Not Exists.\n"); 206 | cout << _graph_file << endl; 207 | exit(1); 208 | } 209 | // const auto end = getCurrentTime(); 210 | // printf("Time Used For Loading BINARY : %.2f\n", end - start); 211 | 212 | // read the edges 213 | // the ids must be in the range from [0 .... the number of vertices - 1]; 214 | numOfEdges = 0; 215 | out_degrees.clear(); 216 | out_degrees.resize(numOfVertices + 2, 0); 217 | in_degrees.clear(); 218 | in_degrees.resize(numOfVertices + 2, 0); 219 | for (auto &edge : edges) { 220 | const NInt &from_id = edge.from_id; 221 | const NInt &to_id = edge.to_id; 222 | // remove self loop 223 | if (from_id != to_id) { 224 | //the edge read is a directed one 225 | ++out_degrees[from_id]; 226 | ++in_degrees[to_id]; 227 | ++numOfEdges; 228 | } 229 | } 230 | /* final count */ 231 | // printf("%d-th Directed Edge Processed.\n", numOfEdges); 232 | 233 | // sort the adj list 234 | // for (auto &neighbors : matrix) { 235 | // std::sort(neighbors.begin(), neighbors.end()); 236 | // } 237 | 238 | // process the dead_end 239 | NInt degree_max = 0; 240 | deadend_vertices.clear(); 241 | for (NInt i = 0; i < numOfVertices; ++i) { 242 | if (out_degrees[i] == 0) { 243 | deadend_vertices.emplace_back(i); 244 | } 245 | degree_max = std::max(degree_max, out_degrees[i]); 246 | } 247 | num_deadend_vertices = deadend_vertices.size(); 248 | 249 | // process pos_list list 250 | start_pos_in_appearance_pos_lists.clear(); 251 | start_pos_in_appearance_pos_lists.resize(numOfVertices + 2, 0); 252 | for (NInt i = 0, j = 1; j < numOfVertices; ++i, ++j) { 253 | start_pos_in_appearance_pos_lists[j] = start_pos_in_appearance_pos_lists[i] + in_degrees[i]; 254 | } 255 | start_pos_in_appearance_pos_lists[numOfVertices] = numOfEdges; 256 | 257 | // process out list 258 | start_pos_in_out_neighbor_lists.clear(); 259 | start_pos_in_out_neighbor_lists.resize(numOfVertices + 2, 0); 260 | for (NInt 
current_id = 0, next_id = 1; next_id < numOfVertices + 1; ++current_id, ++next_id) { 261 | start_pos_in_out_neighbor_lists[next_id] = 262 | start_pos_in_out_neighbor_lists[current_id] + std::max(out_degrees[current_id], (NInt) 1u); 263 | } 264 | // process dummy vertex 265 | assert(start_pos_in_out_neighbor_lists[numOfVertices] == numOfEdges + deadend_vertices.size()); 266 | out_degrees[dummy_id] = 0; 267 | start_pos_in_out_neighbor_lists[numOfVertices + 1] = start_pos_in_out_neighbor_lists[numOfVertices]; 268 | 269 | // compute the positions 270 | IntVector out_positions_to_fill(start_pos_in_out_neighbor_lists.begin(), 271 | start_pos_in_out_neighbor_lists.end()); 272 | // fill the edge list 273 | out_neighbors_lists.clear(); 274 | out_neighbors_lists.resize(numOfEdges + num_deadend_vertices + degree_max, 0); 275 | NInt edges_processed = 0; 276 | NInt msg_gap = std::max((NInt) 1u, numOfEdges / 10); 277 | std::vector> position_pair; 278 | position_pair.reserve(numOfEdges); 279 | for (auto &edge : edges) { 280 | const NInt &from_id = edge.from_id; 281 | const NInt &to_id = edge.to_id; 282 | // remove self loop 283 | if (from_id != to_id) { 284 | NInt &out_position = out_positions_to_fill[from_id]; 285 | assert(out_position < out_positions_to_fill[from_id + 1]); 286 | out_neighbors_lists[out_position] = to_id; 287 | position_pair.emplace_back(to_id, out_position); 288 | ++out_position; 289 | ++edges_processed; 290 | // if (edges_processed % msg_gap == 0) { 291 | // printf("%u edges processed.\n", edges_processed); 292 | // } 293 | } 294 | } 295 | edges.clear(); 296 | printf("Edges processed: %" IDFMT "\n", edges_processed); 297 | 298 | // use reverse position 299 | IntVector in_positions_to_fill(start_pos_in_appearance_pos_lists.begin(), 300 | start_pos_in_appearance_pos_lists.end()); 301 | in_positions_to_fill[numOfVertices] = numOfEdges; 302 | const double time_sort_start = getCurrentTime(); 303 | std::sort(position_pair.begin(), position_pair.end(), 
std::less<>()); 304 | const double time_sort_end = getCurrentTime(); 305 | // MSG(time_sort_end - time_sort_start); 306 | appearance_pos_lists.clear(); 307 | appearance_pos_lists.resize(numOfEdges + num_deadend_vertices + degree_max, 0); 308 | NInt in_pos_pair = 0; 309 | for (const auto &pair : position_pair) { 310 | const NInt &to_id = pair.first; 311 | const NInt &pos = pair.second; 312 | NInt &in_position = in_positions_to_fill[to_id]; 313 | assert(in_position < in_positions_to_fill[to_id + 1]); 314 | appearance_pos_lists[in_position] = pos; 315 | ++in_position; 316 | // if (++in_pos_pair % msg_gap == 0) { 317 | // MSG(in_pos_pair); 318 | // } 319 | } 320 | 321 | printf("Vertices total: %" IDFMT "\n", numOfVertices); 322 | printf("Vertices dead end: %" IDFMT "\n", num_deadend_vertices); 323 | // fill the dummy ids 324 | for (const NInt &id : deadend_vertices) { 325 | out_neighbors_lists[out_positions_to_fill[id]++] = dummy_id; 326 | } 327 | assert(get_neighbor_list_start_pos(get_dummy_id()) == 328 | get_neighbor_list_start_pos(get_dummy_id() + 1)); 329 | const double time_end = getCurrentTime(); 330 | // printf("Graph Build Finished. 
TIME: %.4f\n", time_end - start); 331 | printf("%s\n", std::string(80, '-').c_str()); 332 | } 333 | 334 | void show() const { 335 | // we need to show the dummy 336 | const NInt num_to_show = std::min(numOfVertices + 1, (NInt) 50u); 337 | // show the first elements 338 | show_vector("The Out Degrees of The Vertices:", 339 | IntVector(out_degrees.data(), out_degrees.data() + num_to_show)); 340 | show_vector("The Start Positions of The Vertices in Out Neighbor Lists:", 341 | IntVector(start_pos_in_out_neighbor_lists.data(), 342 | start_pos_in_out_neighbor_lists.data() + num_to_show)); 343 | show_vector("The In Degrees of The Vertices:", 344 | IntVector(in_degrees.data(), in_degrees.data() + num_to_show)); 345 | show_vector("The Start Positions of The Vertices in Appearance List:", 346 | IntVector(start_pos_in_appearance_pos_lists.data(), 347 | start_pos_in_appearance_pos_lists.data() + num_to_show)); 348 | // assume that the number of vertices >= the number of edges; otherwise, there is a potential bug here. 
349 | show_vector("Out Neighbor Lists:", 350 | IntVector(out_neighbors_lists.data(), 351 | out_neighbors_lists.data() + 352 | std::min(numOfEdges + num_deadend_vertices, (NInt) 50u))); 353 | show_vector("The Appearance Positions of Vertices in the Out Neighbor Lists:", 354 | IntVector(appearance_pos_lists.data(), 355 | appearance_pos_lists.data() + std::min(numOfEdges, (NInt) 50u))); 356 | // show_vector("The adj list of the middel vertex", matrix[numOfVertices / 2]); 357 | printf("The position the id appears in outNeighbor List:\n"); 358 | for (NInt id = 0; id < numOfVertices; ++id) { 359 | const NInt &idx_start = start_pos_in_appearance_pos_lists[id]; 360 | const NInt &idx_end = start_pos_in_appearance_pos_lists[id + 1]; 361 | printf("Id:%" IDFMT ";\tPositions: ", id); 362 | for (NInt index = idx_start; index < idx_end; ++index) { 363 | printf("%" IDFMT ", ", appearance_pos_lists[index]); 364 | } 365 | printf("\n"); 366 | } 367 | show_vector("Dead End Vertices List:", 368 | IntVector(deadend_vertices.data(), 369 | deadend_vertices.data() + 370 | std::min(num_deadend_vertices, (NInt) 50u))); 371 | printf("\n%s\n", std::string(80, '-').c_str()); 372 | } 373 | }; 374 | 375 | 376 | class CleanGraph { 377 | NInt numOfVertices = 0; 378 | NInt numOfEdges = 0; 379 | public: 380 | 381 | void clean_graph(const std::string &_input_file, 382 | const std::string &_data_folder) { 383 | std::ifstream inf(_input_file.c_str()); 384 | if (!inf.is_open()) { 385 | printf("CleanGraph::clean_graph; File not exists.\n"); 386 | printf("%s\n", _input_file.c_str()); 387 | exit(1); 388 | } 389 | // status indicator 390 | // printf("\nReading Input Graph\n"); 391 | 392 | std::string line; 393 | /** 394 | * skip the headers, we assume the headers are the comments that 395 | * begins with '#' 396 | */ 397 | while (std::getline(inf, line) && line[0] == '#') {} 398 | if (line.empty() || !isdigit(line[0])) { 399 | printf("Error in CleanGraph::clean_graph. 
Raw File Format Error.\n"); 400 | printf("%s\n", line.c_str()); 401 | exit(1); 402 | } 403 | // create temporary graph 404 | std::vector edges; 405 | numOfEdges = 0; 406 | /** 407 | * read the raw file 408 | */ 409 | size_t num_lines = 0; 410 | // process the first line 411 | { 412 | NInt fromId, toID; 413 | ++num_lines; 414 | size_t end = 0; 415 | fromId = std::stoul(line, &end); 416 | toID = std::stoul(line.substr(end)); 417 | // remove self-loops 418 | edges.emplace_back(fromId, toID); 419 | } 420 | // read the edges 421 | for (NInt fromId, toID; inf >> fromId >> toID;) { 422 | edges.emplace_back(fromId, toID); 423 | if (++num_lines % 5000000 == 0) { printf("%zu Valid Lines Read.\n", num_lines); } 424 | } 425 | 426 | // close the file 427 | inf.close(); 428 | /* final count */ 429 | printf("%zu Lines Read.\n", num_lines); 430 | numOfEdges = edges.size(); 431 | printf("%" IDFMT "-th Non-Self Loop Edges.\n", numOfEdges); 432 | printf("Finish Reading.\n"); 433 | printf("%s\n", std::string(80, '-').c_str()); 434 | 435 | // find the maximum id 436 | size_t id_max = 0; 437 | size_t id_min = std::numeric_limits::max(); 438 | for (const auto &pair : edges) { 439 | id_max = std::max(id_max, (size_t) std::max(pair.from_id, pair.to_id)); 440 | id_min = std::min(id_min, (size_t) std::min(pair.from_id, pair.to_id)); 441 | } 442 | printf("Minimum ID: %zu, Maximum ID: %zu\n", id_min, id_max); 443 | if (id_max >= std::numeric_limits::max()) { 444 | printf("Warning: Change NInt Type First.\n"); 445 | exit(1); 446 | } 447 | const NInt one_plus_id_max = id_max + 1; 448 | IntVector out_degree(one_plus_id_max, 0); 449 | IntVector in_degree(one_plus_id_max, 0); 450 | // compute the degrees. 
451 | for (const auto &edge : edges) { 452 | ++out_degree[edge.from_id]; 453 | ++in_degree[edge.to_id]; 454 | } 455 | // count the number of dead-end vertices 456 | NInt original_dead_end_num = 0; 457 | NInt num_isolated_points = 0; 458 | NInt max_degree = 0; 459 | for (NInt id = 0; id < one_plus_id_max; ++id) { 460 | if (out_degree[id] == 0) { 461 | ++original_dead_end_num; 462 | if (in_degree[id] == 0) { 463 | ++num_isolated_points; 464 | } 465 | } 466 | // compute maximum out degree 467 | max_degree = std::max(out_degree[id], max_degree); 468 | } 469 | printf("The number of dead end vertices: %" IDFMT "\n", original_dead_end_num); 470 | printf("The number of isolated points: %" IDFMT "\n", num_isolated_points); 471 | printf("The maximum out degree is: %" IDFMT "\n", max_degree); 472 | 473 | // we assume the vertice ids are in the arrange of 0 ... numOfVertices - 1 474 | numOfVertices = one_plus_id_max; 475 | 476 | // sort the edges 477 | std::sort(edges.begin(), edges.end()); 478 | 479 | // Write the attribute file 480 | numOfEdges = edges.size(); 481 | std::string attribute_file = _data_folder + '/' + "attribute.txt"; 482 | if (std::FILE *file = std::fopen(attribute_file.c_str(), "w")) { 483 | std::fprintf(file, "n=%" IDFMT "\nm=%" IDFMT "\n", numOfVertices, numOfEdges); 484 | std::fclose(file); 485 | } else { 486 | printf("Graph::clean_graph; File Not Exists.\n"); 487 | printf("%s\n", attribute_file.c_str()); 488 | exit(1); 489 | } 490 | 491 | // write the graph in binary 492 | std::string graph_bin_file = _data_folder + '/' + "graph.bin"; 493 | if (std::FILE *file = std::fopen(graph_bin_file.c_str(), "wb")) { 494 | std::fwrite(edges.data(), sizeof edges[0], edges.size(), file); 495 | printf("Writing Binary Finished.\n"); 496 | std::fclose(file); 497 | } else { 498 | printf("Graph::clean_graph; File Not Exists.\n"); 499 | printf("%s\n", graph_bin_file.c_str()); 500 | exit(1); 501 | } 502 | printf("%s\n", std::string(80, '-').c_str()); 503 | } 504 | }; 505 | 
506 | 507 | #endif //SCARA_GRAPH_H 508 | --------------------------------------------------------------------------------